From d56364954342b5c8debdec5dfdb23d35b2a13fa2 Mon Sep 17 00:00:00 2001 From: Aleksei Moiseitsev Date: Mon, 7 Jul 2025 11:36:04 +0000 Subject: [PATCH 1/2] update libxml to 2.14.4 --- contrib/libs/libxml/Copyright | 7 +- contrib/libs/libxml/HTMLparser.c | 7348 +++---- contrib/libs/libxml/HTMLtree.c | 323 +- contrib/libs/libxml/README.md | 138 +- contrib/libs/libxml/SAX.c | 180 - contrib/libs/libxml/SAX2.c | 1917 +- contrib/libs/libxml/TODO | 276 - contrib/libs/libxml/TODO_SCHEMAS | 31 - contrib/libs/libxml/buf.c | 1695 +- contrib/libs/libxml/buf.h | 72 - contrib/libs/libxml/c14n.c | 384 +- contrib/libs/libxml/catalog.c | 421 +- contrib/libs/libxml/chvalid.c | 4 +- contrib/libs/libxml/config-linux.h | 222 +- contrib/libs/libxml/debugXML.c | 1881 +- contrib/libs/libxml/dict.c | 1612 +- contrib/libs/libxml/elfgcchack.h | 17830 ---------------- contrib/libs/libxml/enc.h | 32 - contrib/libs/libxml/encoding.c | 5245 ++--- contrib/libs/libxml/entities.c | 1037 +- contrib/libs/libxml/error.c | 1373 +- contrib/libs/libxml/globals.c | 1464 +- contrib/libs/libxml/hash.c | 1918 +- contrib/libs/libxml/html5ent.inc | 1607 ++ .../libs/libxml/include/libxml/DOCBparser.h | 96 - .../libs/libxml/include/libxml/HTMLparser.h | 163 +- contrib/libs/libxml/include/libxml/HTMLtree.h | 43 +- contrib/libs/libxml/include/libxml/SAX.h | 159 +- contrib/libs/libxml/include/libxml/SAX2.h | 94 +- contrib/libs/libxml/include/libxml/c14n.h | 29 +- contrib/libs/libxml/include/libxml/catalog.h | 79 +- contrib/libs/libxml/include/libxml/chvalid.h | 22 +- contrib/libs/libxml/include/libxml/debugXML.h | 166 +- contrib/libs/libxml/include/libxml/dict.h | 31 +- contrib/libs/libxml/include/libxml/encoding.h | 268 +- contrib/libs/libxml/include/libxml/entities.h | 72 +- contrib/libs/libxml/include/libxml/globals.h | 503 +- contrib/libs/libxml/include/libxml/hash.h | 185 +- contrib/libs/libxml/include/libxml/list.h | 58 +- contrib/libs/libxml/include/libxml/nanoftp.h | 153 +- 
contrib/libs/libxml/include/libxml/nanohttp.h | 53 +- contrib/libs/libxml/include/libxml/parser.h | 938 +- .../libxml/include/libxml/parserInternals.h | 377 +- contrib/libs/libxml/include/libxml/pattern.h | 42 +- contrib/libs/libxml/include/libxml/relaxng.h | 69 +- .../libxml/include/libxml/schemasInternals.h | 13 +- .../libs/libxml/include/libxml/schematron.h | 43 +- contrib/libs/libxml/include/libxml/threads.h | 54 +- contrib/libs/libxml/include/libxml/tree.h | 523 +- contrib/libs/libxml/include/libxml/uri.h | 46 +- contrib/libs/libxml/include/libxml/valid.h | 243 +- contrib/libs/libxml/include/libxml/xinclude.h | 36 +- contrib/libs/libxml/include/libxml/xlink.h | 18 +- contrib/libs/libxml/include/libxml/xmlIO.h | 225 +- .../libs/libxml/include/libxml/xmlautomata.h | 65 +- contrib/libs/libxml/include/libxml/xmlerror.h | 121 +- .../libs/libxml/include/libxml/xmlexports.h | 113 +- .../libs/libxml/include/libxml/xmlmemory.h | 180 +- .../libs/libxml/include/libxml/xmlmodule.h | 14 +- .../libs/libxml/include/libxml/xmlreader.h | 202 +- .../libs/libxml/include/libxml/xmlregexp.h | 148 +- contrib/libs/libxml/include/libxml/xmlsave.h | 53 +- .../libs/libxml/include/libxml/xmlschemas.h | 84 +- .../libxml/include/libxml/xmlschemastypes.h | 75 +- .../libs/libxml/include/libxml/xmlstring.h | 62 +- .../libs/libxml/include/libxml/xmlunicode.h | 193 +- .../libs/libxml/include/libxml/xmlversion.h | 185 +- .../libs/libxml/include/libxml/xmlwriter.h | 169 +- contrib/libs/libxml/include/libxml/xpath.h | 117 +- .../libxml/include/libxml/xpathInternals.h | 253 +- contrib/libs/libxml/include/libxml/xpointer.h | 80 +- contrib/libs/libxml/include/private/buf.h | 45 + contrib/libs/libxml/include/private/cata.h | 13 + contrib/libs/libxml/include/private/dict.h | 74 + contrib/libs/libxml/include/private/enc.h | 19 + .../libs/libxml/include/private/entities.h | 42 + contrib/libs/libxml/include/private/error.h | 46 + contrib/libs/libxml/include/private/globals.h | 15 + 
contrib/libs/libxml/include/private/html.h | 14 + contrib/libs/libxml/include/private/io.h | 44 + contrib/libs/libxml/include/private/lint.h | 15 + contrib/libs/libxml/include/private/memory.h | 58 + contrib/libs/libxml/include/private/parser.h | 152 + contrib/libs/libxml/include/private/regexp.h | 23 + contrib/libs/libxml/include/private/save.h | 24 + contrib/libs/libxml/include/private/string.h | 13 + contrib/libs/libxml/include/private/threads.h | 61 + contrib/libs/libxml/include/private/tree.h | 22 + contrib/libs/libxml/include/private/unicode.h | 44 + .../libs/libxml/include/private/xinclude.h | 9 + contrib/libs/libxml/include/private/xpath.h | 16 + contrib/libs/libxml/include/private/xzlib.h | 32 + contrib/libs/libxml/iso8859x.inc | 730 + contrib/libs/libxml/legacy.c | 1343 -- contrib/libs/libxml/libxml.h | 165 +- contrib/libs/libxml/libxml2.syms | 2295 -- contrib/libs/libxml/lintmain.c | 14 + contrib/libs/libxml/list.c | 47 +- contrib/libs/libxml/nanoftp.c | 2118 -- contrib/libs/libxml/nanohttp.c | 414 +- contrib/libs/libxml/parser.c | 13739 ++++++------ contrib/libs/libxml/parserInternals.c | 3894 ++-- .../libxml/patches/format_string_vuln.patch | 38 - contrib/libs/libxml/patches/rand.patch | 52 - contrib/libs/libxml/patches/va_args.patch | 27 - contrib/libs/libxml/patches/yencoding.patch | 26 - contrib/libs/libxml/pattern.c | 505 +- contrib/libs/libxml/relaxng.c | 674 +- contrib/libs/libxml/save.h | 36 - contrib/libs/libxml/schematron.c | 1506 +- contrib/libs/libxml/shell.c | 1628 ++ contrib/libs/libxml/threads.c | 1049 +- contrib/libs/libxml/timsort.h | 2 + contrib/libs/libxml/tree.c | 6960 +++--- contrib/libs/libxml/uri.c | 1436 +- contrib/libs/libxml/valid.c | 2773 ++- contrib/libs/libxml/xinclude.c | 2530 +-- contrib/libs/libxml/xlink.c | 28 +- contrib/libs/libxml/xmlIO.c | 4256 ++-- contrib/libs/libxml/xmllint.c | 4050 ++-- contrib/libs/libxml/xmllint/ya.make | 2 +- contrib/libs/libxml/xmlmemory.c | 940 +- contrib/libs/libxml/xmlmodule.c | 196 +- 
contrib/libs/libxml/xmlreader.c | 2273 +- contrib/libs/libxml/xmlregexp.c | 1990 +- contrib/libs/libxml/xmlsave.c | 1590 +- contrib/libs/libxml/xmlschemas.c | 1674 +- contrib/libs/libxml/xmlschemastypes.c | 1438 +- contrib/libs/libxml/xmlstring.c | 289 +- contrib/libs/libxml/xmlunicode.c | 703 +- contrib/libs/libxml/xmlwriter.c | 181 +- contrib/libs/libxml/xpath.c | 5280 ++--- contrib/libs/libxml/xpointer.c | 3187 +-- contrib/libs/libxml/xzlib.c | 82 +- contrib/libs/libxml/ya.make | 22 +- contrib/libs/libxml/yencoding.cpp | 48 - contrib/libs/libxml/yencoding.h | 24 - 137 files changed, 46547 insertions(+), 80420 deletions(-) delete mode 100644 contrib/libs/libxml/SAX.c delete mode 100644 contrib/libs/libxml/TODO delete mode 100644 contrib/libs/libxml/TODO_SCHEMAS delete mode 100644 contrib/libs/libxml/buf.h delete mode 100644 contrib/libs/libxml/elfgcchack.h delete mode 100644 contrib/libs/libxml/enc.h create mode 100644 contrib/libs/libxml/html5ent.inc delete mode 100644 contrib/libs/libxml/include/libxml/DOCBparser.h create mode 100644 contrib/libs/libxml/include/private/buf.h create mode 100644 contrib/libs/libxml/include/private/cata.h create mode 100644 contrib/libs/libxml/include/private/dict.h create mode 100644 contrib/libs/libxml/include/private/enc.h create mode 100644 contrib/libs/libxml/include/private/entities.h create mode 100644 contrib/libs/libxml/include/private/error.h create mode 100644 contrib/libs/libxml/include/private/globals.h create mode 100644 contrib/libs/libxml/include/private/html.h create mode 100644 contrib/libs/libxml/include/private/io.h create mode 100644 contrib/libs/libxml/include/private/lint.h create mode 100644 contrib/libs/libxml/include/private/memory.h create mode 100644 contrib/libs/libxml/include/private/parser.h create mode 100644 contrib/libs/libxml/include/private/regexp.h create mode 100644 contrib/libs/libxml/include/private/save.h create mode 100644 contrib/libs/libxml/include/private/string.h create mode 100644 
contrib/libs/libxml/include/private/threads.h create mode 100644 contrib/libs/libxml/include/private/tree.h create mode 100644 contrib/libs/libxml/include/private/unicode.h create mode 100644 contrib/libs/libxml/include/private/xinclude.h create mode 100644 contrib/libs/libxml/include/private/xpath.h create mode 100644 contrib/libs/libxml/include/private/xzlib.h create mode 100644 contrib/libs/libxml/iso8859x.inc delete mode 100644 contrib/libs/libxml/legacy.c delete mode 100644 contrib/libs/libxml/libxml2.syms create mode 100644 contrib/libs/libxml/lintmain.c delete mode 100644 contrib/libs/libxml/nanoftp.c delete mode 100644 contrib/libs/libxml/patches/format_string_vuln.patch delete mode 100644 contrib/libs/libxml/patches/rand.patch delete mode 100644 contrib/libs/libxml/patches/va_args.patch delete mode 100644 contrib/libs/libxml/patches/yencoding.patch delete mode 100644 contrib/libs/libxml/save.h create mode 100644 contrib/libs/libxml/shell.c delete mode 100644 contrib/libs/libxml/yencoding.cpp delete mode 100644 contrib/libs/libxml/yencoding.h diff --git a/contrib/libs/libxml/Copyright b/contrib/libs/libxml/Copyright index d61318502caf..8c0b7c15dd9b 100644 --- a/contrib/libs/libxml/Copyright +++ b/contrib/libs/libxml/Copyright @@ -1,8 +1,9 @@ -Except where otherwise noted in the source code (e.g. the files hash.c, -list.c and the trio files, which are covered by a similar licence but -with different Copyright notices) all the files are: +Except where otherwise noted in the source code (e.g. the files dict.c and +list.c, which are covered by a similar licence but with different Copyright +notices) all the files are: Copyright (C) 1998-2012 Daniel Veillard. All Rights Reserved. + Copyright (C) The Libxml2 Contributors. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/contrib/libs/libxml/HTMLparser.c b/contrib/libs/libxml/HTMLparser.c index 3e8a16574003..7388c84ea996 100644 --- a/contrib/libs/libxml/HTMLparser.c +++ b/contrib/libs/libxml/HTMLparser.c @@ -1,5 +1,14 @@ /* - * HTMLparser.c : an HTML 4.0 non-verifying parser + * HTMLparser.c : an HTML parser + * + * References: + * HTML Living Standard + * https://html.spec.whatwg.org/multipage/parsing.html + * + * Tokenization now conforms to HTML5. Tree construction still follows + * a custom, non-standard implementation. See: + * + * https://gitlab.gnome.org/GNOME/libxml2/-/issues/211 * * See Copyright for the status of this software. * @@ -11,54 +20,91 @@ #ifdef LIBXML_HTML_ENABLED #include -#ifdef HAVE_CTYPE_H #include -#endif -#ifdef HAVE_STDLIB_H #include -#endif -#ifdef HAVE_SYS_STAT_H -#include -#endif -#ifdef HAVE_FCNTL_H -#include -#endif -#ifdef HAVE_UNISTD_H -#include -#endif -#ifdef LIBXML_ZLIB_ENABLED -#include -#endif +#include #include #include #include #include #include -#include #include #include #include -#include #include -#include #include -#include "buf.h" -#include "enc.h" +#include "private/buf.h" +#include "private/dict.h" +#include "private/enc.h" +#include "private/error.h" +#include "private/html.h" +#include "private/io.h" +#include "private/memory.h" +#include "private/parser.h" +#include "private/tree.h" #define HTML_MAX_NAMELEN 1000 +#define HTML_MAX_ATTRS 100000000 /* 100 million */ #define HTML_PARSER_BIG_BUFFER_SIZE 1000 #define HTML_PARSER_BUFFER_SIZE 100 -/* #define DEBUG */ -/* #define DEBUG_PUSH */ +#define IS_WS_HTML(c) \ + (((c) == 0x20) || \ + (((c) >= 0x09) && ((c) <= 0x0D) && ((c) != 0x0B))) + +#define IS_HEX_DIGIT(c) \ + ((IS_ASCII_DIGIT(c)) || \ + ((((c) | 0x20) >= 'a') && (((c) | 0x20) <= 'f'))) + +#define IS_UPPER(c) \ + (((c) >= 'A') && ((c) <= 'Z')) + +#define 
IS_ALNUM(c) \ + (IS_ASCII_LETTER(c) || IS_ASCII_DIGIT(c)) + +typedef enum { + INSERT_INITIAL = 1, + INSERT_IN_HEAD = 3, + INSERT_IN_BODY = 10 +} htmlInsertMode; + +typedef const unsigned htmlAsciiMask[2]; + +static htmlAsciiMask MASK_DQ = { + 0, + 1u << ('"' - 32), +}; +static htmlAsciiMask MASK_SQ = { + 0, + 1u << ('\'' - 32), +}; +static htmlAsciiMask MASK_GT = { + 0, + 1u << ('>' - 32), +}; +static htmlAsciiMask MASK_DASH = { + 0, + 1u << ('-' - 32), +}; +static htmlAsciiMask MASK_WS_GT = { + 1u << 0x09 | 1u << 0x0A | 1u << 0x0C | 1u << 0x0D, + 1u << (' ' - 32) | 1u << ('>' - 32), +}; +static htmlAsciiMask MASK_DQ_GT = { + 0, + 1u << ('"' - 32) | 1u << ('>' - 32), +}; +static htmlAsciiMask MASK_SQ_GT = { + 0, + 1u << ('\'' - 32) | 1u << ('>' - 32), +}; static int htmlOmittedDefaultValue = 1; -xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, - xmlChar end, xmlChar end2, xmlChar end3); -static void htmlParseComment(htmlParserCtxtPtr ctxt); +static int +htmlParseElementInternal(htmlParserCtxtPtr ctxt); /************************************************************************ * * @@ -74,25 +120,9 @@ static void htmlParseComment(htmlParserCtxtPtr ctxt); * Handle a redefinition of attribute error */ static void -htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) +htmlErrMemory(xmlParserCtxtPtr ctxt) { - if ((ctxt != NULL) && (ctxt->disableSAX != 0) && - (ctxt->instate == XML_PARSER_EOF)) - return; - if (ctxt != NULL) { - ctxt->errNo = XML_ERR_NO_MEMORY; - ctxt->instate = XML_PARSER_EOF; - ctxt->disableSAX = 1; - } - if (extra) - __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, - XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, - NULL, NULL, 0, 0, - "Memory allocation failed : %s\n", extra); - else - __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, - XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, - NULL, NULL, 0, 0, "Memory allocation failed\n"); + xmlCtxtErrMemory(ctxt); } /** @@ -109,43 +139,8 @@ static void 
LIBXML_ATTR_FORMAT(3,0) htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, const char *msg, const xmlChar *str1, const xmlChar *str2) { - if ((ctxt != NULL) && (ctxt->disableSAX != 0) && - (ctxt->instate == XML_PARSER_EOF)) - return; - if (ctxt != NULL) - ctxt->errNo = error; - __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, - XML_ERR_ERROR, NULL, 0, - (const char *) str1, (const char *) str2, - NULL, 0, 0, - msg, str1, str2); - if (ctxt != NULL) - ctxt->wellFormed = 0; -} - -/** - * htmlParseErrInt: - * @ctxt: an HTML parser context - * @error: the error number - * @msg: the error message - * @val: integer info - * - * Handle a fatal parser error, i.e. violating Well-Formedness constraints - */ -static void LIBXML_ATTR_FORMAT(3,0) -htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, - const char *msg, int val) -{ - if ((ctxt != NULL) && (ctxt->disableSAX != 0) && - (ctxt->instate == XML_PARSER_EOF)) - return; - if (ctxt != NULL) - ctxt->errNo = error; - __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, - XML_ERR_ERROR, NULL, 0, NULL, NULL, - NULL, val, 0, msg, val); - if (ctxt != NULL) - ctxt->wellFormed = 0; + xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR, + str1, str2, NULL, 0, msg, str1, str2); } /************************************************************************ @@ -161,25 +156,32 @@ htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, * * Pushes a new element name on top of the name stack * - * Returns 0 in case of error, the index in the stack otherwise + * Returns -1 in case of error, the index in the stack otherwise */ static int htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) { - if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head"))) - ctxt->html = 3; - if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body"))) - ctxt->html = 10; + if ((ctxt->html < INSERT_IN_HEAD) && (xmlStrEqual(value, BAD_CAST "head"))) + ctxt->html = INSERT_IN_HEAD; + if 
((ctxt->html < INSERT_IN_BODY) && (xmlStrEqual(value, BAD_CAST "body"))) + ctxt->html = INSERT_IN_BODY; if (ctxt->nameNr >= ctxt->nameMax) { - ctxt->nameMax *= 2; - ctxt->nameTab = (const xmlChar * *) - xmlRealloc((xmlChar * *)ctxt->nameTab, - ctxt->nameMax * - sizeof(ctxt->nameTab[0])); - if (ctxt->nameTab == NULL) { - htmlErrMemory(ctxt, NULL); - return (0); + const xmlChar **tmp; + int newSize; + + newSize = xmlGrowCapacity(ctxt->nameMax, sizeof(tmp[0]), + 10, XML_MAX_ITEMS); + if (newSize < 0) { + htmlErrMemory(ctxt); + return (-1); + } + tmp = xmlRealloc(ctxt->nameTab, newSize * sizeof(tmp[0])); + if (tmp == NULL) { + htmlErrMemory(ctxt); + return(-1); } + ctxt->nameTab = tmp; + ctxt->nameMax = newSize; } ctxt->nameTab[ctxt->nameNr] = value; ctxt->name = value; @@ -225,17 +227,22 @@ static int htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value) { if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) { - if (ctxt->nodeInfoMax == 0) - ctxt->nodeInfoMax = 5; - ctxt->nodeInfoMax *= 2; - ctxt->nodeInfoTab = (htmlParserNodeInfo *) - xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab, - ctxt->nodeInfoMax * - sizeof(ctxt->nodeInfoTab[0])); - if (ctxt->nodeInfoTab == NULL) { - htmlErrMemory(ctxt, NULL); + xmlParserNodeInfo *tmp; + int newSize; + + newSize = xmlGrowCapacity(ctxt->nodeInfoMax, sizeof(tmp[0]), + 5, XML_MAX_ITEMS); + if (newSize < 0) { + htmlErrMemory(ctxt); + return (0); + } + tmp = xmlRealloc(ctxt->nodeInfoTab, newSize * sizeof(tmp[0])); + if (tmp == NULL) { + htmlErrMemory(ctxt); return (0); } + ctxt->nodeInfoTab = tmp; + ctxt->nodeInfoMax = newSize; } ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value; ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; @@ -286,11 +293,6 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt) * * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding * - * CURRENT Returns the current char value, with the full decoding of - * UTF-8 if we are using this mode. It returns an int. 
- * NEXT Skip to the next character, this does the proper decoding - * in UTF-8 mode. It also pop-up unfinished entities on the fly. - * NEXTL(l) Skip the current unicode character of l xmlChars long. * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly */ @@ -305,46 +307,22 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt) #define CUR_PTR ctxt->input->cur #define BASE_PTR ctxt->input->base -#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ - (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ - xmlParserInputShrink(ctxt->input) - -#define GROW if ((ctxt->progressive == 0) && \ - (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ - xmlParserInputGrow(ctxt->input, INPUT_CHUNK) +#define SHRINK \ + if ((!PARSER_PROGRESSIVE(ctxt)) && \ + (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ + (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ + xmlParserShrink(ctxt); -#define CURRENT ((int) (*ctxt->input->cur)) +#define GROW \ + if ((!PARSER_PROGRESSIVE(ctxt)) && \ + (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ + xmlParserGrow(ctxt); #define SKIP_BLANKS htmlSkipBlankChars(ctxt) /* Imported from XML */ -/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ -#define CUR ((int) (*ctxt->input->cur)) -#define NEXT xmlNextChar(ctxt) - -#define RAW (ctxt->token ? 
-1 : (*ctxt->input->cur)) - - -#define NEXTL(l) do { \ - if (*(ctxt->input->cur) == '\n') { \ - ctxt->input->line++; ctxt->input->col = 1; \ - } else ctxt->input->col++; \ - ctxt->token = 0; ctxt->input->cur += l; \ - } while (0) - -/************ - \ - if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ - if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); - ************/ - -#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) -#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) - -#define COPY_BUF(l,b,i,v) \ - if (l == 1) b[i++] = (xmlChar) v; \ - else i += xmlCopyChar(l,&b[i],v) +#define CUR (*ctxt->input->cur) /** * htmlFindEncoding: @@ -363,10 +341,10 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt) static xmlChar * htmlFindEncoding(xmlParserCtxtPtr ctxt) { const xmlChar *start, *cur, *end; + xmlChar *ret; if ((ctxt == NULL) || (ctxt->input == NULL) || - (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || - (ctxt->input->buf->encoder != NULL)) + (ctxt->input->flags & XML_INPUT_HAS_ENCODING)) return(NULL); if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) return(NULL); @@ -388,205 +366,83 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) { return(NULL); cur += 8; start = cur; - while (((*cur >= 'A') && (*cur <= 'Z')) || - ((*cur >= 'a') && (*cur <= 'z')) || - ((*cur >= '0') && (*cur <= '9')) || + while ((IS_ALNUM(*cur)) || (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/')) cur++; if (cur == start) return(NULL); - return(xmlStrndup(start, cur - start)); + ret = xmlStrndup(start, cur - start); + if (ret == NULL) + htmlErrMemory(ctxt); + return(ret); } -/** - * htmlCurrentChar: - * @ctxt: the HTML parser context - * @len: pointer to the length of the char read - * - * The current char value, if using UTF-8 this may actually span multiple - * bytes in the input buffer. 
Implement the end of line normalization: - * 2.11 End-of-Line Handling - * If the encoding is unspecified, in the case we find an ISO-Latin-1 - * char, then the encoding converter is plugged in automatically. - * - * Returns the current char value and its length - */ +static int +htmlMaskMatch(htmlAsciiMask mask, unsigned c) { + if (c >= 64) + return(0); + return((mask[c/32] >> (c & 31)) & 1); +} static int -htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { - const unsigned char *cur; - unsigned char c; - unsigned int val; +htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len, + int partial) { + unsigned c = str[0]; + int size; + + if (c < 0xC2) { + goto invalid; + } else if (c < 0xE0) { + if (len < 2) + goto incomplete; + if ((str[1] & 0xC0) != 0x80) + goto invalid; + size = 2; + } else if (c < 0xF0) { + unsigned v; + + if (len < 3) + goto incomplete; + + v = str[1] << 8 | str[2]; /* hint to generate 16-bit load */ + v |= c << 16; + + if (((v & 0x00C0C0) != 0x008080) || + ((v & 0x0F2000) == 0x000000) || + ((v & 0x0F2000) == 0x0D2000)) + goto invalid; + + size = 3; + } else { + unsigned v; - if (ctxt->instate == XML_PARSER_EOF) - return(0); + if (len < 4) + goto incomplete; - if (ctxt->token != 0) { - *len = 0; - return(ctxt->token); - } - if (ctxt->charset != XML_CHAR_ENCODING_UTF8) { - xmlChar * guess; - xmlCharEncodingHandlerPtr handler; + v = c << 24 | str[1] << 16 | str[2] << 8 | str[3]; - /* - * Assume it's a fixed length encoding (1) with - * a compatible encoding for the ASCII set, since - * HTML constructs only use < 128 chars - */ - if ((int) *ctxt->input->cur < 0x80) { - *len = 1; - if ((*ctxt->input->cur == 0) && - (ctxt->input->cur < ctxt->input->end)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Char 0x%X out of allowed range\n", 0); - return(' '); - } - return((int) *ctxt->input->cur); - } + if (((v & 0x00C0C0C0) != 0x00808080) || + (v < 0xF0900000) || (v >= 0xF4900000)) + goto invalid; - /* - * Humm this is bad, do an 
automatic flow conversion - */ - guess = htmlFindEncoding(ctxt); - if (guess == NULL) { - xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); - } else { - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = guess; - handler = xmlFindCharEncodingHandler((const char *) guess); - if (handler != NULL) { - /* - * Don't use UTF-8 encoder which isn't required and - * can produce invalid UTF-8. - */ - if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8")) - xmlSwitchToEncoding(ctxt, handler); - } else { - htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, - "Unsupported encoding %s", guess, NULL); - } - } - ctxt->charset = XML_CHAR_ENCODING_UTF8; + size = 4; } - /* - * We are supposed to handle UTF8, check it's valid - * From rfc2044: encoding of the Unicode values on UTF-8: - * - * UCS-4 range (hex.) UTF-8 octet sequence (binary) - * 0000 0000-0000 007F 0xxxxxxx - * 0000 0080-0000 07FF 110xxxxx 10xxxxxx - * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx - * - * Check for the 0x110000 limit too - */ - cur = ctxt->input->cur; - c = *cur; - if (c & 0x80) { - if ((c & 0x40) == 0) - goto encoding_error; - if (cur[1] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if ((cur[1] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xe0) == 0xe0) { - - if (cur[2] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if ((cur[2] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xf0) == 0xf0) { - if (cur[3] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if (((c & 0xf8) != 0xf0) || - ((cur[3] & 0xc0) != 0x80)) - goto encoding_error; - /* 4-byte code */ - *len = 4; - val = (cur[0] & 0x7) << 18; - val |= (cur[1] & 0x3f) << 12; - val |= (cur[2] & 0x3f) << 6; - val |= cur[3] & 0x3f; - if (val < 0x10000) - goto encoding_error; - } else { - /* 3-byte code */ - *len = 3; - val = (cur[0] & 0xf) << 12; - val |= (cur[1] & 0x3f) 
<< 6; - val |= cur[2] & 0x3f; - if (val < 0x800) - goto encoding_error; - } - } else { - /* 2-byte code */ - *len = 2; - val = (cur[0] & 0x1f) << 6; - val |= cur[1] & 0x3f; - if (val < 0x80) - goto encoding_error; - } - if (!IS_CHAR(val)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Char 0x%X out of allowed range\n", val); - } - return(val); - } else { - if ((*ctxt->input->cur == 0) && - (ctxt->input->cur < ctxt->input->end)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Char 0x%X out of allowed range\n", 0); - *len = 1; - return(' '); - } - /* 1-byte code */ - *len = 1; - return((int) *ctxt->input->cur); - } + return(size); -encoding_error: - /* - * If we detect an UTF8 error that probably mean that the - * input encoding didn't get properly advertised in the - * declaration header. Report the error and switch the encoding - * to ISO-Latin-1 (if you don't like this policy, just declare the - * encoding !) - */ - { - char buffer[150]; +incomplete: + if (partial) + return(0); - if (ctxt->input->end - ctxt->input->cur >= 4) { - snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", - ctxt->input->cur[0], ctxt->input->cur[1], - ctxt->input->cur[2], ctxt->input->cur[3]); - } else { - snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]); - } - htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, - "Input is not proper UTF-8, indicate encoding !\n", - BAD_CAST buffer, NULL); +invalid: + /* Only report the first error */ + if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) { + htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, + "Invalid bytes in character encoding", NULL, NULL); + ctxt->input->flags |= XML_INPUT_ENCODING_ERROR; } - /* - * Don't switch encodings twice. Note that if there's an encoder, we - * shouldn't receive invalid UTF-8 anyway. - * - * Note that if ctxt->input->buf == NULL, switching encodings is - * impossible, see Gitlab issue #34. 
- */ - if ((ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder == NULL)) - xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); - *len = 1; - return((int) *ctxt->input->cur); + return(-1); } /** @@ -600,22 +456,46 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { static int htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { + const xmlChar *cur = ctxt->input->cur; + size_t avail = ctxt->input->end - cur; int res = 0; + int line = ctxt->input->line; + int col = ctxt->input->col; - while (IS_BLANK_CH(*(ctxt->input->cur))) { - if ((*ctxt->input->cur == 0) && - (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { - xmlPopInput(ctxt); - } else { - if (*(ctxt->input->cur) == '\n') { - ctxt->input->line++; ctxt->input->col = 1; - } else ctxt->input->col++; - ctxt->input->cur++; - if (*ctxt->input->cur == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - } - res++; + while (!PARSER_STOPPED(ctxt)) { + if (avail == 0) { + ctxt->input->cur = cur; + GROW; + cur = ctxt->input->cur; + avail = ctxt->input->end - cur; + + if (avail == 0) + break; + } + + if (*cur == '\n') { + line++; + col = 1; + } else if (IS_WS_HTML(*cur)) { + col++; + } else { + break; + } + + cur += 1; + avail -= 1; + + if (res < INT_MAX) + res++; } + + ctxt->input->cur = cur; + ctxt->input->line = line; + ctxt->input->col = col; + + if (res > 8) + GROW; + return(res); } @@ -637,438 +517,396 @@ htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { * 2 means that this element is valid only in the Frameset DTD * * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description - , subElements , impliedsubelt , Attributes, userdata */ -/* Definitions and a couple of vars for HTML Elements */ - -#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" -#define NB_FONTSTYLE 8 -#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym" -#define NB_PHRASE 10 -#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", 
"sub", "sup", "span", "bdo", "iframe" -#define NB_SPECIAL 16 -#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL -#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL -#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" -#define NB_BLOCK NB_HEADING + NB_LIST + 14 -#define FORMCTRL "input", "select", "textarea", "label", "button" -#define NB_FORMCTRL 5 -#define PCDATA -#define NB_PCDATA 0 -#define HEADING "h1", "h2", "h3", "h4", "h5", "h6" -#define NB_HEADING 6 -#define LIST "ul", "ol", "dir", "menu" -#define NB_LIST 4 -#define MODIFIER -#define NB_MODIFIER 0 -#define FLOW BLOCK,INLINE -#define NB_FLOW NB_BLOCK + NB_INLINE -#define EMPTY NULL - - -static const char* const html_flow[] = { FLOW, NULL } ; -static const char* const html_inline[] = { INLINE, NULL } ; - -/* placeholders: elts with content but no subelements */ -static const char* const html_pcdata[] = { NULL } ; -#define html_cdata html_pcdata - - -/* ... and for HTML Attributes */ - -#define COREATTRS "id", "class", "style", "title" -#define NB_COREATTRS 4 -#define I18N "lang", "dir" -#define NB_I18N 2 -#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup" -#define NB_EVENTS 9 -#define ATTRS COREATTRS,I18N,EVENTS -#define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS -#define CELLHALIGN "align", "char", "charoff" -#define NB_CELLHALIGN 3 -#define CELLVALIGN "valign" -#define NB_CELLVALIGN 1 - -static const char* const html_attrs[] = { ATTRS, NULL } ; -static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ; -static const char* const core_attrs[] = { COREATTRS, NULL } ; -static const char* const i18n_attrs[] = { I18N, NULL } ; - - -/* Other declarations that should go inline ... 
*/ -static const char* const a_attrs[] = { ATTRS, "charset", "type", "name", - "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords", - "tabindex", "onfocus", "onblur", NULL } ; -static const char* const target_attr[] = { "target", NULL } ; -static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ; -static const char* const alt_attr[] = { "alt", NULL } ; -static const char* const src_alt_attrs[] = { "src", "alt", NULL } ; -static const char* const href_attrs[] = { "href", NULL } ; -static const char* const clear_attrs[] = { "clear", NULL } ; -static const char* const inline_p[] = { INLINE, "p", NULL } ; - -static const char* const flow_param[] = { FLOW, "param", NULL } ; -static const char* const applet_attrs[] = { COREATTRS , "codebase", - "archive", "alt", "name", "height", "width", "align", - "hspace", "vspace", NULL } ; -static const char* const area_attrs[] = { "shape", "coords", "href", "nohref", - "tabindex", "accesskey", "onfocus", "onblur", NULL } ; -static const char* const basefont_attrs[] = - { "id", "size", "color", "face", NULL } ; -static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ; -static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ; -static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ; -static const char* const body_depr[] = { "background", "bgcolor", "text", - "link", "vlink", "alink", NULL } ; -static const char* const button_attrs[] = { ATTRS, "name", "value", "type", - "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ; - - -static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ; -static const char* const col_elt[] = { "col", NULL } ; -static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ; -static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ; -static const char* const dl_contents[] = { "dt", "dd", NULL } ; -static const char* const compact_attr[] = { "compact", 
NULL } ; -static const char* const label_attr[] = { "label", NULL } ; -static const char* const fieldset_contents[] = { FLOW, "legend" } ; -static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ; -static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ; -static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ; -static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ; -static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ; -static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ; -static const char* const head_attrs[] = { I18N, "profile", NULL } ; -static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ; -static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ; -static const char* const version_attr[] = { "version", NULL } ; -static const char* const html_content[] = { "head", "body", "frameset", NULL } ; -static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ; -static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ; -static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ; -static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", 
"readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ; -static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ; -static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ; -static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ; -static const char* const align_attr[] = { "align", NULL } ; -static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ; -static const char* const map_contents[] = { BLOCK, "area", NULL } ; -static const char* const name_attr[] = { "name", NULL } ; -static const char* const action_attr[] = { "action", NULL } ; -static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; -static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ; -static const char* const content_attr[] = { "content", NULL } ; -static const char* const type_attr[] = { "type", NULL } ; -static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; -static const char* const object_contents[] = { FLOW, "param", NULL } ; -static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ; -static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ; -static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ; -static const char* const option_elt[] = { "option", NULL } ; -static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; -static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ; -static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; -static const char* const width_attr[] = { "width", NULL } ; -static const 
char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; -static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; -static const char* const language_attr[] = { "language", NULL } ; -static const char* const select_content[] = { "optgroup", "option", NULL } ; -static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; -static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; -static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; -static const char* const table_depr[] = { "align", "bgcolor", NULL } ; -static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; -static const char* const tr_elt[] = { "tr", NULL } ; -static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; -static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; -static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; -static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; -static const char* const tr_contents[] = { "th", "td", NULL } ; -static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; -static const char* const li_elt[] = { "li", NULL } ; -static const char* const ul_depr[] = { "type", "compact", NULL} ; -static const char* const dir_attr[] = { "dir", NULL} ; - -#define DECL (const char**) +#define DATA_RCDATA 1 +#define DATA_RAWTEXT 2 +#define DATA_PLAINTEXT 3 +#define DATA_SCRIPT 4 +#define DATA_SCRIPT_ESC1 5 +#define DATA_SCRIPT_ESC2 6 static const 
htmlElemDesc html40ElementTable[] = { { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", - DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", - DECL html_inline , NULL , DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "acronym", 0, 0, 0, 0, 0, 0, 1, "", - DECL html_inline , NULL , DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", - DECL inline_p , NULL , DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", - DECL flow_param , NULL , NULL , DECL applet_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", - EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr + NULL, NULL, NULL, NULL, NULL, + 0 }, { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", - DECL html_inline , NULL , DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", - EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs + NULL, NULL, NULL, NULL, NULL, + 0 }, { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , - EMPTY , NULL , NULL, DECL basefont_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", - DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr + NULL, NULL, NULL, NULL, NULL, + 0 }, { "big", 0, 3, 0, 0, 0, 0, 1, "large text style", - DECL html_inline , NULL , DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", - DECL html_flow , NULL , DECL quote_attrs , NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "body", 1, 1, 0, 0, 0, 0, 0, "document body ", - DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "br", 0, 2, 2, 1, 
0, 0, 1, "forced line break ", - EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "button", 0, 0, 0, 0, 0, 0, 2, "push button ", - DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", - DECL html_inline , NULL , DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", - DECL html_flow , NULL , NULL, DECL html_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "cite", 0, 0, 0, 0, 0, 0, 1, "citation", - DECL html_inline , NULL , DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", - DECL html_inline , NULL , DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "col", 0, 2, 2, 1, 0, 0, 0, "table column ", - EMPTY , NULL , DECL col_attrs , NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", - DECL col_elt , "col" , DECL col_attrs , NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", - DECL html_flow , NULL , DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", - DECL html_flow , NULL , DECL edit_attrs , NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", - DECL html_inline , NULL , DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", - DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", - DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", - DECL dl_contents , "dd" , DECL 
html_attrs, DECL compact_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", - DECL html_inline, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", - DECL html_inline, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ", - EMPTY, NULL, DECL embed_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", - DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", - DECL html_inline, NULL, NULL, DECL font_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", - DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr + NULL, NULL, NULL, NULL, NULL, + 0 }, { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , - EMPTY, NULL, NULL, DECL frame_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , - DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", - DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", - DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", - DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ", - DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", - DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 
}, { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ", - DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "head", 1, 1, 0, 0, 0, 0, 0, "document head ", - DECL head_contents, NULL, DECL head_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , - EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", - DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", - DECL html_inline, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", - DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + DATA_RAWTEXT }, { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", - EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs + NULL, NULL, NULL, NULL, NULL, + 0 }, { "input", 0, 2, 2, 1, 0, 0, 1, "form control ", - EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", - DECL html_flow, NULL, DECL edit_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", - EMPTY, NULL, NULL, DECL prompt_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", - DECL html_inline, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", - DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", - DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "li", 0, 1, 1, 0, 0, 0, 0, "list item ", - 
DECL html_flow, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ", - EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", - DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr + NULL, NULL, NULL, NULL, NULL, + 0 }, { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ", - DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", - EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr + NULL, NULL, NULL, NULL, NULL, + 0 +}, +{ "noembed", 0, 0, 0, 0, 0, 0, 0, "", + NULL, NULL, NULL, NULL, NULL, + DATA_RAWTEXT }, { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", - DECL noframes_content, "body" , DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + DATA_RAWTEXT }, { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", - DECL html_flow, "div", DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", - DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", - DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", - DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr + NULL, NULL, NULL, NULL, NULL, + 0 }, { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , - DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", - DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "param", 0, 2, 2, 1, 0, 
0, 0, "named property value ", - EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr + NULL, NULL, NULL, NULL, NULL, + 0 +}, +{ "plaintext", 0, 0, 0, 0, 0, 0, 0, "", + NULL, NULL, NULL, NULL, NULL, + DATA_PLAINTEXT }, { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", - DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", - DECL html_inline, NULL, DECL quote_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", - DECL html_inline, NULL, NULL, DECL html_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", - DECL html_inline, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", - DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr + NULL, NULL, NULL, NULL, NULL, + DATA_SCRIPT }, { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ", - DECL select_content, NULL, DECL select_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "small", 0, 3, 0, 0, 0, 0, 1, "small text style", - DECL html_inline, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", - DECL html_inline, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", - DECL html_inline, NULL, NULL, DECL html_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", - DECL html_inline, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "style", 0, 0, 0, 0, 0, 0, 0, "style info ", - DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr + NULL, NULL, NULL, NULL, NULL, + DATA_RAWTEXT }, { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", - DECL html_inline, 
NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", - DECL html_inline, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "table", 0, 0, 0, 0, 0, 0, 0, "", - DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", - DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", - DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", - DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr + NULL, NULL, NULL, NULL, NULL, + DATA_RCDATA }, { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", - DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell", - DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", - DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "title", 0, 0, 0, 0, 0, 0, 0, "document title ", - DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + DATA_RCDATA }, { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ", - DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", - DECL html_inline, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", - DECL html_inline, NULL, NULL, DECL html_attrs, NULL + NULL, NULL, NULL, NULL, NULL, + 0 }, { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", - DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL + NULL, NULL, 
NULL, NULL, NULL, + 0 }, { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", - DECL html_inline, NULL, DECL html_attrs, NULL, NULL + NULL, NULL, NULL, NULL, NULL, + 0 +}, +{ "xmp", 0, 0, 0, 0, 0, 0, 1, "", + NULL, NULL, NULL, NULL, NULL, + DATA_RAWTEXT } }; @@ -1238,7 +1076,6 @@ static const htmlStartCloseEntry htmlStartClose[] = { { "menu", "form" }, { "menu", "ul" }, { "ol", "form" }, - { "ol", "ul" }, { "option", "optgroup" }, { "option", "option" }, { "p", "address" }, @@ -1322,7 +1159,6 @@ static const htmlStartCloseEntry htmlStartClose[] = { { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, - { "ul", "ol" }, { "ul", "pre" }, { "xmp", "dd" }, { "xmp", "dl" }, @@ -1334,19 +1170,6 @@ static const htmlStartCloseEntry htmlStartClose[] = { { "xmp", "ul" } }; -/* - * The list of HTML elements which are supposed not to have - * CDATA content and where a p element will be implied - * - * TODO: extend that list by reading the HTML SGML DTD on - * implied paragraph - */ -static const char *const htmlNoContentElements[] = { - "html", - "head", - NULL -}; - /* * The list of HTML attributes which are of content %Script; * NOTE: when adding ones, check htmlIsScriptAttribute() since @@ -1407,10 +1230,25 @@ static const elementPriority htmlEndPriority[] = { * * ************************************************************************/ +static void +htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) { + /* + * Capture end position and add node + */ + if ( ctxt->node != NULL && ctxt->record_info ) { + ctxt->nodeInfo->end_pos = ctxt->input->consumed + + (CUR_PTR - ctxt->input->base); + ctxt->nodeInfo->end_line = ctxt->input->line; + ctxt->nodeInfo->node = ctxt->node; + xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo); + htmlNodeInfoPop(ctxt); + } +} + /** * htmlInitAutoClose: * - * This is a no-op now. + * DEPRECATED: This is a no-op. 
*/ void htmlInitAutoClose(void) { @@ -1511,6 +1349,9 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) const htmlElemDesc *info; int i, priority; + if (ctxt->options & HTML_PARSE_HTML5) + return; + priority = htmlGetEndPriority(newtag); for (i = (ctxt->nameNr - 1); i >= 0; i--) { @@ -1536,6 +1377,7 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) "Opening and ending tag mismatch: %s and %s\n", newtag, ctxt->name); } + htmlParserFinishElementParsing(ctxt); if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) ctxt->sax->endElement(ctxt->userData, ctxt->name); htmlnamePop(ctxt); @@ -1553,9 +1395,13 @@ htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) { int i; + if (ctxt->options & HTML_PARSE_HTML5) + return; + if (ctxt->nameNr == 0) return; for (i = (ctxt->nameNr - 1); i >= 0; i--) { + htmlParserFinishElementParsing(ctxt); if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) ctxt->sax->endElement(ctxt->userData, ctxt->name); htmlnamePop(ctxt); @@ -1577,20 +1423,15 @@ htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) static void htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) { - while ((newtag != NULL) && (ctxt->name != NULL) && - (htmlCheckAutoClose(newtag, ctxt->name))) { - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, ctxt->name); - htmlnamePop(ctxt); - } - if (newtag == NULL) { - htmlAutoCloseOnEnd(ctxt); + if (ctxt->options & HTML_PARSE_HTML5) return; - } - while ((newtag == NULL) && (ctxt->name != NULL) && - ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || - (xmlStrEqual(ctxt->name, BAD_CAST "body")) || - (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { + + if (newtag == NULL) + return; + + while ((ctxt->name != NULL) && + (htmlCheckAutoClose(newtag, ctxt->name))) { + htmlParserFinishElementParsing(ctxt); if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) ctxt->sax->endElement(ctxt->userData, ctxt->name); htmlnamePop(ctxt); @@ -1603,6 +1444,8 @@ 
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) * @name: The tag name * @elem: the HTML element * + * DEPRECATED: Internal function, don't use. + * * The HTML DTD allows a tag to implicitly close other tags. * The list is kept in htmlStartClose array. This function checks * if the element or one of it's children would autoclose the @@ -1630,6 +1473,8 @@ htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { * @doc: the HTML document * @elem: the HTML element * + * DEPRECATED: Internal function, don't use. + * * The HTML DTD allows a tag to implicitly close other tags. * The list is kept in htmlStartClose array. This function checks * if a tag is autoclosed by one of it's child @@ -1662,7 +1507,7 @@ static void htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { int i; - if (ctxt->options & HTML_PARSE_NOIMPLIED) + if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5)) return; if (!htmlOmittedDefaultValue) return; @@ -1682,7 +1527,7 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { (xmlStrEqual(newtag, BAD_CAST"link")) || (xmlStrEqual(newtag, BAD_CAST"title")) || (xmlStrEqual(newtag, BAD_CAST"base")))) { - if (ctxt->html >= 3) { + if (ctxt->html >= INSERT_IN_HEAD) { /* we already saw or generated an before */ return; } @@ -1696,7 +1541,7 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && (!xmlStrEqual(newtag, BAD_CAST"frame")) && (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { - if (ctxt->html >= 10) { + if (ctxt->html >= INSERT_IN_BODY) { /* we already saw or generated a before */ return; } @@ -1716,45 +1561,22 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { } /** - * htmlCheckParagraph + * htmlStartCharData * @ctxt: an HTML parser context * - * Check whether a p element need to be implied before inserting - * characters in the current element. 
- * - * Returns 1 if a paragraph has been inserted, 0 if not and -1 - * in case of error. + * Prepare for non-whitespace character data. */ -static int -htmlCheckParagraph(htmlParserCtxtPtr ctxt) { - const xmlChar *tag; - int i; - - if (ctxt == NULL) - return(-1); - tag = ctxt->name; - if (tag == NULL) { - htmlAutoClose(ctxt, BAD_CAST"p"); - htmlCheckImplied(ctxt, BAD_CAST"p"); - htmlnamePush(ctxt, BAD_CAST"p"); - if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) - ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); - return(1); - } +static void +htmlStartCharData(htmlParserCtxtPtr ctxt) { + if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5)) + return; if (!htmlOmittedDefaultValue) - return(0); - for (i = 0; htmlNoContentElements[i] != NULL; i++) { - if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { - htmlAutoClose(ctxt, BAD_CAST"p"); - htmlCheckImplied(ctxt, BAD_CAST"p"); - htmlnamePush(ctxt, BAD_CAST"p"); - if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) - ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); - return(1); - } - } - return(0); + return; + + if (xmlStrEqual(ctxt->name, BAD_CAST "head")) + htmlAutoClose(ctxt, BAD_CAST "p"); + htmlCheckImplied(ctxt, BAD_CAST "p"); } /** @@ -2081,21 +1903,6 @@ static const htmlEntityDesc html40EntitiesTable[] = { * * ************************************************************************/ -/* - * Macro used to grow the current buffer. 
- */ -#define growBuffer(buffer) { \ - xmlChar *tmp; \ - buffer##_size *= 2; \ - tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ - if (tmp == NULL) { \ - htmlErrMemory(ctxt, "growing buffer\n"); \ - xmlFree(buffer); \ - return(NULL); \ - } \ - buffer = tmp; \ -} - /** * htmlEntityLookup: * @name: the entity name @@ -2119,6 +1926,14 @@ htmlEntityLookup(const xmlChar *name) { return(NULL); } +static int +htmlCompareEntityDesc(const void *vkey, const void *vdesc) { + const unsigned *key = vkey; + const htmlEntityDesc *desc = vdesc; + + return((int) *key - (int) desc->value); +} + /** * htmlEntityValueLookup: * @value: the entity's unicode value @@ -2131,21 +1946,18 @@ htmlEntityLookup(const xmlChar *name) { */ const htmlEntityDesc * htmlEntityValueLookup(unsigned int value) { - unsigned int i; + const htmlEntityDesc *desc; + size_t nmemb; - for (i = 0;i < (sizeof(html40EntitiesTable)/ - sizeof(html40EntitiesTable[0]));i++) { - if (html40EntitiesTable[i].value >= value) { - if (html40EntitiesTable[i].value > value) - break; - return((htmlEntityDescPtr) &html40EntitiesTable[i]); - } - } - return(NULL); + nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]); + desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc), + htmlCompareEntityDesc); + + return(desc); } /** - * UTF8ToHtml: + * htmlUTF8ToHtml: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of UTF-8 chars @@ -2160,91 +1972,88 @@ htmlEntityValueLookup(unsigned int value) { * The value of @outlen after return is the number of octets consumed. 
*/ int -UTF8ToHtml(unsigned char* out, int *outlen, - const unsigned char* in, int *inlen) { - const unsigned char* processed = in; - const unsigned char* outend; - const unsigned char* outstart = out; +htmlUTF8ToHtml(unsigned char* out, int *outlen, + const unsigned char* in, int *inlen) { const unsigned char* instart = in; const unsigned char* inend; - unsigned int c, d; - int trailing; + unsigned char* outstart = out; + unsigned char* outend; + int ret = XML_ENC_ERR_SPACE; + + if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) + return(XML_ENC_ERR_INTERNAL); - if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); if (in == NULL) { /* * initialization nothing to do */ *outlen = 0; *inlen = 0; - return(0); + return(XML_ENC_ERR_SUCCESS); } - inend = in + (*inlen); - outend = out + (*outlen); + + inend = in + *inlen; + outend = out + *outlen; while (in < inend) { - d = *in++; - if (d < 0x80) { c= d; trailing= 0; } - else if (d < 0xC0) { - /* trailing byte in leading position */ - *outlen = out - outstart; - *inlen = processed - instart; - return(-2); - } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } - else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } - else if (d < 0xF8) { c= d & 0x07; trailing= 3; } - else { - /* no chance for this in Ascii */ - *outlen = out - outstart; - *inlen = processed - instart; - return(-2); - } + const htmlEntityDesc *ent; + const char *cp; + char nbuf[16]; + unsigned c, d; + int seqlen, len, i; + + d = *in; + + if (d < 0x80) { + if (out >= outend) + goto done; + *out++ = d; + in += 1; + continue; + } + + if (d < 0xE0) { c = d & 0x1F; seqlen = 2; } + else if (d < 0xF0) { c = d & 0x0F; seqlen = 3; } + else { c = d & 0x07; seqlen = 4; } - if (inend - in < trailing) { + if (inend - in < seqlen) break; - } - for ( ; trailing; trailing--) { - if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) - break; + for (i = 1; i < seqlen; i++) { + d = in[i]; c <<= 6; c |= d & 0x3F; } - /* assertion: c is a single UTF-4 value */ - 
if (c < 0x80) { - if (out + 1 >= outend) - break; - *out++ = c; - } else { - int len; - const htmlEntityDesc * ent; - const char *cp; - char nbuf[16]; + /* + * Try to lookup a predefined HTML entity for it + */ + ent = htmlEntityValueLookup(c); - /* - * Try to lookup a predefined HTML entity for it - */ + if (ent == NULL) { + snprintf(nbuf, sizeof(nbuf), "#%u", c); + cp = nbuf; + } else { + cp = ent->name; + } - ent = htmlEntityValueLookup(c); - if (ent == NULL) { - snprintf(nbuf, sizeof(nbuf), "#%u", c); - cp = nbuf; - } - else - cp = ent->name; - len = strlen(cp); - if (out + 2 + len >= outend) - break; - *out++ = '&'; - memcpy(out, cp, len); - out += len; - *out++ = ';'; - } - processed = in; + len = strlen(cp); + if (outend - out < len + 2) + goto done; + + *out++ = '&'; + memcpy(out, cp, len); + out += len; + *out++ = ';'; + + in += seqlen; } - *outlen = out - outstart; - *inlen = processed - instart; - return(0); + + ret = out - outstart; + +done: + *outlen = out - outstart; + *inlen = in - instart; + return(ret); } /** @@ -2332,7 +2141,7 @@ htmlEncodeEntities(unsigned char* out, int *outlen, else cp = ent->name; len = strlen(cp); - if (out + 2 + len > outend) + if (outend - out < len + 2) break; *out++ = '&'; memcpy(out, cp, len); @@ -2346,47 +2155,6 @@ htmlEncodeEntities(unsigned char* out, int *outlen, return(0); } -/************************************************************************ - * * - * Commodity functions to handle streams * - * * - ************************************************************************/ - -#ifdef LIBXML_PUSH_ENABLED -/** - * htmlNewInputStream: - * @ctxt: an HTML parser context - * - * Create a new input stream structure - * Returns the new input stream or NULL - */ -static htmlParserInputPtr -htmlNewInputStream(htmlParserCtxtPtr ctxt) { - htmlParserInputPtr input; - - input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); - if (input == NULL) { - htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); - 
return(NULL); - } - memset(input, 0, sizeof(htmlParserInput)); - input->filename = NULL; - input->directory = NULL; - input->base = NULL; - input->cur = NULL; - input->buf = NULL; - input->line = 1; - input->col = 1; - input->buf = NULL; - input->free = NULL; - input->version = NULL; - input->consumed = 0; - input->length = 0; - return(input); -} -#endif - - /************************************************************************ * * * Commodity functions, cleanup needed ? * @@ -2415,7 +2183,7 @@ static const char *allowPCData[] = { * * Is this a sequence of blank chars that one can ignore ? * - * Returns 1 if ignorable 0 otherwise. + * Returns 1 if ignorable 0 if whitespace, -1 otherwise. */ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { @@ -2425,7 +2193,7 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { xmlDtdPtr dtd; for (j = 0;j < len;j++) - if (!(IS_BLANK_CH(str[j]))) return(0); + if (!(IS_WS_HTML(str[j]))) return(-1); if (CUR == 0) return(1); if (CUR != '<') return(0); @@ -2492,10 +2260,8 @@ htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { * Allocate a new document and fill the fields. 
*/ cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); - if (cur == NULL) { - htmlErrMemory(NULL, "HTML document creation failed\n"); + if (cur == NULL) return(NULL); - } memset(cur, 0, sizeof(xmlDoc)); cur->type = XML_HTML_DOCUMENT_NODE; @@ -2515,9 +2281,16 @@ htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { cur->charset = XML_CHAR_ENCODING_UTF8; cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; if ((ExternalID != NULL) || - (URI != NULL)) - xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); - if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue)) + (URI != NULL)) { + xmlDtdPtr intSubset; + + intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); + if (intSubset == NULL) { + xmlFree(cur); + return(NULL); + } + } + if ((xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue)) xmlRegisterNodeDefaultValue((xmlNodePtr)cur); return(cur); } @@ -2555,8 +2328,6 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { * * ************************************************************************/ -static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); - /** * htmlParseHTMLName: * @ctxt: an HTML parser context @@ -2567,294 +2338,544 @@ static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); * Returns the Tag Name parsed or NULL */ -static const xmlChar * -htmlParseHTMLName(htmlParserCtxtPtr ctxt) { - int i = 0; - xmlChar loc[HTML_PARSER_BUFFER_SIZE]; +static xmlHashedString +htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) { + xmlHashedString ret; + xmlChar buf[HTML_PARSER_BUFFER_SIZE]; + const xmlChar *in; + size_t avail; + int eof = PARSER_PROGRESSIVE(ctxt); + int nbchar = 0; + int stop = attr ? 
'=' : ' '; - if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && - (CUR != ':') && (CUR != '.')) return(NULL); + in = ctxt->input->cur; + avail = ctxt->input->end - in; - while ((i < HTML_PARSER_BUFFER_SIZE) && - ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || - (CUR == ':') || (CUR == '-') || (CUR == '_') || - (CUR == '.'))) { - if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; - else loc[i] = CUR; - i++; + while (1) { + int c, size; + + if ((!eof) && (avail < 32)) { + size_t oldAvail = avail; + + ctxt->input->cur = in; + + SHRINK; + xmlParserGrow(ctxt); + + in = ctxt->input->cur; + avail = ctxt->input->end - in; + + if (oldAvail == avail) + eof = 1; + } + + if (avail == 0) + break; - NEXT; + c = *in; + size = 1; + + if ((nbchar != 0) && + ((c == '/') || (c == '>') || (c == stop) || + (IS_WS_HTML(c)))) + break; + + if (c == 0) { + if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) { + buf[nbchar++] = 0xEF; + buf[nbchar++] = 0xBF; + buf[nbchar++] = 0xBD; + } + } else if (c < 0x80) { + if (nbchar < HTML_PARSER_BUFFER_SIZE) { + if (IS_UPPER(c)) + c += 0x20; + buf[nbchar++] = c; + } + } else { + size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0); + + if (size > 0) { + if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) { + memcpy(buf + nbchar, in, size); + nbchar += size; + } + } else { + size = 1; + + if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) { + buf[nbchar++] = 0xEF; + buf[nbchar++] = 0xBF; + buf[nbchar++] = 0xBD; + } + } + } + + in += size; + avail -= size; } - return(xmlDictLookup(ctxt->dict, loc, i)); -} + ctxt->input->cur = in; + + SHRINK; + ret = xmlDictLookupHashed(ctxt->dict, buf, nbchar); + if (ret.name == NULL) + htmlErrMemory(ctxt); -/** - * htmlParseHTMLName_nonInvasive: - * @ctxt: an HTML parser context - * - * parse an HTML tag or attribute name, note that we convert it to lowercase - * since HTML names are not case-sensitive, this doesn't consume the data - * from the stream, it's a look-ahead - * - * Returns the Tag Name parsed or NULL - */ + return(ret); 
+} + +static const short htmlC1Remap[32] = { + 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, + 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 +}; static const xmlChar * -htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { +htmlCodePointToUtf8(int c, xmlChar *out, int *osize) { int i = 0; - xmlChar loc[HTML_PARSER_BUFFER_SIZE]; + int bits, hi; - if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && - (NXT(1) != ':')) return(NULL); + if ((c >= 0x80) && (c < 0xA0)) { + c = htmlC1Remap[c - 0x80]; + } else if ((c <= 0) || + ((c >= 0xD800) && (c < 0xE000)) || + (c > 0x10FFFF)) { + c = 0xFFFD; + } - while ((i < HTML_PARSER_BUFFER_SIZE) && - ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || - (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { - if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; - else loc[i] = NXT(1+i); - i++; + if (c < 0x80) { bits = 0; hi = 0x00; } + else if (c < 0x800) { bits = 6; hi = 0xC0; } + else if (c < 0x10000) { bits = 12; hi = 0xE0; } + else { bits = 18; hi = 0xF0; } + + out[i++] = (c >> bits) | hi; + + while (bits > 0) { + bits -= 6; + out[i++] = ((c >> bits) & 0x3F) | 0x80; } - return(xmlDictLookup(ctxt->dict, loc, i)); + *osize = i; + return(out); } +#include "html5ent.inc" -/** - * htmlParseName: - * @ctxt: an HTML parser context - * - * parse an HTML name, this routine is case sensitive. 
- * - * Returns the Name parsed or NULL - */ +#define ENT_F_SEMICOLON 0x80u +#define ENT_F_SUBTABLE 0x40u +#define ENT_F_ALL 0xC0u static const xmlChar * -htmlParseName(htmlParserCtxtPtr ctxt) { - const xmlChar *in; - const xmlChar *ret; - int count = 0; - - GROW; +htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr, + int *nlen, int *rlen) { + const xmlChar *match = NULL; + unsigned left, right; + int first = string[0]; + size_t matchLen = 0; + size_t soff = 1; + + if (slen < 2) + return(NULL); + if (!IS_ASCII_LETTER(first)) + return(NULL); /* - * Accelerator for simple ASCII names + * Look up range by first character */ - in = ctxt->input->cur; - if (((*in >= 0x61) && (*in <= 0x7A)) || - ((*in >= 0x41) && (*in <= 0x5A)) || - (*in == '_') || (*in == ':')) { - in++; - while (((*in >= 0x61) && (*in <= 0x7A)) || - ((*in >= 0x41) && (*in <= 0x5A)) || - ((*in >= 0x30) && (*in <= 0x39)) || - (*in == '_') || (*in == '-') || - (*in == ':') || (*in == '.')) - in++; - - if (in == ctxt->input->end) - return(NULL); - - if ((*in > 0) && (*in < 0x80)) { - count = in - ctxt->input->cur; - ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); - ctxt->input->cur = in; - ctxt->input->col += count; - return(ret); - } - } - return(htmlParseNameComplex(ctxt)); -} - -static const xmlChar * -htmlParseNameComplex(xmlParserCtxtPtr ctxt) { - int len = 0, l; - int c; - int count = 0; - const xmlChar *base = ctxt->input->base; + first &= 63; + left = htmlEntAlpha[first*3] | htmlEntAlpha[first*3+1] << 8; + right = left + htmlEntAlpha[first*3+2]; /* - * Handler for more complex cases + * Binary search */ - GROW; - c = CUR_CHAR(l); - if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ - (!IS_LETTER(c) && (c != '_') && - (c != ':'))) { - return(NULL); - } + while (left < right) { + const xmlChar *bytes; + unsigned mid; + size_t len; + int cmp; + + mid = left + (right - left) / 2; + bytes = htmlEntStrings + htmlEntValues[mid]; + len = bytes[0] & ~ENT_F_ALL; + + 
cmp = string[soff] - bytes[1]; + + if (cmp == 0) { + if (slen < len) { + cmp = strncmp((const char *) string + soff + 1, + (const char *) bytes + 2, + slen - 1); + /* Prefix can never match */ + if (cmp == 0) + break; + } else { + cmp = strncmp((const char *) string + soff + 1, + (const char *) bytes + 2, + len - 1); + } + } - while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ - ((IS_LETTER(c)) || (IS_DIGIT(c)) || - (c == '.') || (c == '-') || - (c == '_') || (c == ':') || - (IS_COMBINING(c)) || - (IS_EXTENDER(c)))) { - if (count++ > 100) { - count = 0; - GROW; - } - len += l; - NEXTL(l); - c = CUR_CHAR(l); - if (ctxt->input->base != base) { - /* - * We changed encoding from an unknown encoding - * Input buffer changed location, so we better start again - */ - return(htmlParseNameComplex(ctxt)); - } - } + if (cmp < 0) { + right = mid; + } else if (cmp > 0) { + left = mid + 1; + } else { + int term = soff + len < slen ? string[soff + len] : 0; + int isAlnum, isTerm; + + isAlnum = IS_ALNUM(term); + isTerm = ((term == ';') || + ((bytes[0] & ENT_F_SEMICOLON) && + ((!isAttr) || + ((!isAlnum) && (term != '='))))); + + if (isTerm) { + match = bytes + len + 1; + matchLen = soff + len; + if (term == ';') + matchLen += 1; + } - if (ctxt->input->cur - ctxt->input->base < len) { - /* Sanity check */ - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "unexpected change of input buffer", NULL, NULL); - return (NULL); + if (bytes[0] & ENT_F_SUBTABLE) { + if (isTerm) + match += 2; + + if ((isAlnum) && (soff + len < slen)) { + left = mid + bytes[len + 1]; + right = left + bytes[len + 2]; + soff += len; + continue; + } + } + + break; + } } - return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); -} + if (match == NULL) + return(NULL); + *nlen = matchLen; + *rlen = match[0]; + return(match + 1); +} /** - * htmlParseHTMLAttribute: + * htmlParseData: * @ctxt: an HTML parser context - * @stop: a char stop value + * @mask: mask of terminating characters + * 
@comment: true if parsing a comment + * @refs: true if references are allowed + * @maxLength: maximum output length * - * parse an HTML attribute value till the stop (quote), if - * stop is 0 then it stops at the first space + * Parse data until terminator is reached. * - * Returns the attribute parsed or NULL + * Returns the parsed string or NULL in case of errors. */ static xmlChar * -htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { - xmlChar *buffer = NULL; - int buffer_size = 0; - xmlChar *out = NULL; - const xmlChar *name = NULL; - const xmlChar *cur = NULL; - const htmlEntityDesc * ent; - - /* - * allocate a translation buffer. - */ - buffer_size = HTML_PARSER_BUFFER_SIZE; - buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); +htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask, + int comment, int refs, int maxLength) { + xmlParserInputPtr input = ctxt->input; + xmlChar *ret = NULL; + xmlChar *buffer; + xmlChar utf8Char[4]; + size_t buffer_size; + size_t used; + int eof = PARSER_PROGRESSIVE(ctxt); + int line, col; + int termSkip = -1; + + used = 0; + buffer_size = ctxt->spaceMax; + buffer = (xmlChar *) ctxt->spaceTab; if (buffer == NULL) { - htmlErrMemory(ctxt, "buffer allocation failed\n"); - return(NULL); + buffer_size = 500; + buffer = xmlMalloc(buffer_size + 1); + if (buffer == NULL) { + htmlErrMemory(ctxt); + return(NULL); + } } - out = buffer; - /* - * Ok loop until we reach one of the ending chars - */ - while ((CUR != 0) && (CUR != stop)) { - if ((stop == 0) && (CUR == '>')) break; - if ((stop == 0) && (IS_BLANK_CH(CUR))) break; - if (CUR == '&') { - if (NXT(1) == '#') { - unsigned int c; - int bits; - - c = htmlParseCharRef(ctxt); - if (c < 0x80) - { *out++ = c; bits= -6; } - else if (c < 0x800) - { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } - else if (c < 0x10000) - { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } - else - { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } - - for ( ; bits >= 0; bits-= 6) { - 
*out++ = ((c >> bits) & 0x3F) | 0x80; - } + line = input->line; + col = input->col; - if (out - buffer > buffer_size - 100) { - int indx = out - buffer; + while (!PARSER_STOPPED(ctxt)) { + const xmlChar *chunk, *in, *repl; + size_t avail, chunkSize, extraSize; + int replSize; + int skip = 0; + int ncr = 0; + int ncrSize = 0; + int cp = 0; - growBuffer(buffer); - out = &buffer[indx]; - } - } else { - ent = htmlParseEntityRef(ctxt, &name); - if (name == NULL) { - *out++ = '&'; - if (out - buffer > buffer_size - 100) { - int indx = out - buffer; - - growBuffer(buffer); - out = &buffer[indx]; - } - } else if (ent == NULL) { - *out++ = '&'; - cur = name; - while (*cur != 0) { - if (out - buffer > buffer_size - 100) { - int indx = out - buffer; - - growBuffer(buffer); - out = &buffer[indx]; - } - *out++ = *cur++; - } - } else { - unsigned int c; - int bits; - - if (out - buffer > buffer_size - 100) { - int indx = out - buffer; - - growBuffer(buffer); - out = &buffer[indx]; - } - c = ent->value; - if (c < 0x80) - { *out++ = c; bits= -6; } - else if (c < 0x800) - { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } - else if (c < 0x10000) - { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } - else - { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } - - for ( ; bits >= 0; bits-= 6) { - *out++ = ((c >> bits) & 0x3F) | 0x80; - } - } - } - } else { - unsigned int c; - int bits, l; + chunk = input->cur; + avail = input->end - chunk; + in = chunk; - if (out - buffer > buffer_size - 100) { - int indx = out - buffer; + repl = BAD_CAST ""; + replSize = 0; - growBuffer(buffer); - out = &buffer[indx]; - } - c = CUR_CHAR(l); - if (c < 0x80) - { *out++ = c; bits= -6; } - else if (c < 0x800) - { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } - else if (c < 0x10000) - { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } - else - { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } + while (!PARSER_STOPPED(ctxt)) { + size_t j; + int cur, size; - for ( ; bits >= 0; bits-= 6) { - *out++ = ((c >> bits) & 0x3F) | 
0x80; - } - NEXT; - } + if ((!eof) && (avail <= 64)) { + size_t oldAvail = avail; + size_t off = in - chunk; + + input->cur = in; + + xmlParserGrow(ctxt); + + in = input->cur; + chunk = in - off; + input->cur = chunk; + avail = input->end - in; + + if (oldAvail == avail) + eof = 1; + } + + if (avail == 0) { + termSkip = 0; + break; + } + + cur = *in; + size = 1; + col += 1; + + if (htmlMaskMatch(mask, cur)) { + if (comment) { + if (avail < 2) { + termSkip = 1; + } else if (in[1] == '-') { + if (avail < 3) { + termSkip = 2; + } else if (in[2] == '>') { + termSkip = 3; + } else if (in[2] == '!') { + if (avail < 4) + termSkip = 3; + else if (in[3] == '>') + termSkip = 4; + } + } + + if (termSkip >= 0) + break; + } else { + termSkip = 0; + break; + } + } + + if (ncr) { + int lc = cur | 0x20; + int digit; + + if ((cur >= '0') && (cur <= '9')) { + digit = cur - '0'; + } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) { + digit = (lc - 'a') + 10; + } else { + if (cur == ';') { + in += 1; + size += 1; + ncrSize += 1; + } + goto next_chunk; + } + + cp = cp * ncr + digit; + if (cp >= 0x110000) + cp = 0x110000; + + ncrSize += 1; + + goto next_char; + } + + switch (cur) { + case '&': + if (!refs) + break; + + j = 1; + + if ((j < avail) && (in[j] == '#')) { + j += 1; + if (j < avail) { + if ((in[j] | 0x20) == 'x') { + j += 1; + if ((j < avail) && (IS_HEX_DIGIT(in[j]))) { + ncr = 16; + size = 3; + ncrSize = 3; + cp = 0; + } + } else if (IS_ASCII_DIGIT(in[j])) { + ncr = 10; + size = 2; + ncrSize = 2; + cp = 0; + } + } + } else { + repl = htmlFindEntityPrefix(in + j, + avail - j, + /* isAttr */ 1, + &skip, &replSize); + if (repl != NULL) { + skip += 1; + goto next_chunk; + } + + skip = 0; + } + + break; + + case '\0': + skip = 1; + repl = BAD_CAST "\xEF\xBF\xBD"; + replSize = 3; + goto next_chunk; + + case '\n': + line += 1; + col = 1; + break; + + case '\r': + skip = 1; + if (in[1] != 0x0A) { + repl = BAD_CAST "\x0A"; + replSize = 1; + } + goto next_chunk; + + default: + if 
(cur < 0x80) + break; + + if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) { + xmlChar * guess; + + if (in > chunk) + goto next_chunk; + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + guess = NULL; +#else + guess = htmlFindEncoding(ctxt); +#endif + if (guess == NULL) { + xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); + } else { + xmlSwitchEncodingName(ctxt, (const char *) guess); + xmlFree(guess); + } + input->flags |= XML_INPUT_HAS_ENCODING; + + eof = PARSER_PROGRESSIVE(ctxt); + goto restart; + } + + size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0); + + if (size <= 0) { + skip = 1; + repl = BAD_CAST "\xEF\xBF\xBD"; + replSize = 3; + goto next_chunk; + } + + break; + } + +next_char: + in += size; + avail -= size; + } + +next_chunk: + if (ncrSize > 0) { + skip = ncrSize; + in -= ncrSize; + + repl = htmlCodePointToUtf8(cp, utf8Char, &replSize); + } + + chunkSize = in - chunk; + extraSize = chunkSize + replSize; + + if (extraSize > maxLength - used) { + htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT, + "value too long\n", NULL, NULL); + goto error; + } + + if (extraSize > buffer_size - used) { + size_t newSize = (used + extraSize) * 2; + xmlChar *tmp = xmlRealloc(buffer, newSize + 1); + + if (tmp == NULL) { + htmlErrMemory(ctxt); + goto error; + } + buffer = tmp; + buffer_size = newSize; + } + + if (chunkSize > 0) { + input->cur += chunkSize; + memcpy(buffer + used, chunk, chunkSize); + used += chunkSize; + } + + input->cur += skip; + if (replSize > 0) { + memcpy(buffer + used, repl, replSize); + used += replSize; + } + + SHRINK; + + if (termSkip >= 0) + break; + +restart: + ; + } + + if (termSkip > 0) { + input->cur += termSkip; + col += termSkip; + } + + input->line = line; + input->col = col; + + ret = xmlMalloc(used + 1); + if (ret == NULL) { + htmlErrMemory(ctxt); + } else { + memcpy(ret, buffer, used); + ret[used] = 0; } - *out = 0; - return(buffer); + +error: + ctxt->spaceTab = (void *) buffer; + ctxt->spaceMax = buffer_size; + + return(ret); } /** 
@@ -2862,49 +2883,14 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { * @ctxt: an HTML parser context * @str: location to store the entity name * - * parse an HTML ENTITY references + * DEPRECATED: Internal function, don't use. * - * [68] EntityRef ::= '&' Name ';' - * - * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, - * if non-NULL *str will have to be freed by the caller. + * Returns NULL. */ const htmlEntityDesc * -htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { - const xmlChar *name; - const htmlEntityDesc * ent = NULL; - - if (str != NULL) *str = NULL; - if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); - - if (CUR == '&') { - NEXT; - name = htmlParseName(ctxt); - if (name == NULL) { - htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, - "htmlParseEntityRef: no name\n", NULL, NULL); - } else { - GROW; - if (CUR == ';') { - if (str != NULL) - *str = name; - - /* - * Lookup the entity in the table. - */ - ent = htmlEntityLookup(name); - if (ent != NULL) /* OK that's ugly !!! */ - NEXT; - } else { - htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, - "htmlParseEntityRef: expecting ';'\n", - NULL, NULL); - if (str != NULL) - *str = name; - } - } - } - return(ent); +htmlParseEntityRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, + const xmlChar **str ATTRIBUTE_UNUSED) { + return(NULL); } /** @@ -2922,735 +2908,635 @@ htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { static xmlChar * htmlParseAttValue(htmlParserCtxtPtr ctxt) { xmlChar *ret = NULL; + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? 
+ XML_MAX_HUGE_LENGTH : + XML_MAX_TEXT_LENGTH; if (CUR == '"') { - NEXT; - ret = htmlParseHTMLAttribute(ctxt, '"'); - if (CUR != '"') { - htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, - "AttValue: \" expected\n", NULL, NULL); - } else - NEXT; + SKIP(1); + ret = htmlParseData(ctxt, MASK_DQ, 0, 1, maxLength); + if (CUR == '"') + SKIP(1); } else if (CUR == '\'') { - NEXT; - ret = htmlParseHTMLAttribute(ctxt, '\''); - if (CUR != '\'') { - htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, - "AttValue: ' expected\n", NULL, NULL); - } else - NEXT; + SKIP(1); + ret = htmlParseData(ctxt, MASK_SQ, 0, 1, maxLength); + if (CUR == '\'') + SKIP(1); } else { - /* - * That's an HTMLism, the attribute value may not be quoted - */ - ret = htmlParseHTMLAttribute(ctxt, 0); - if (ret == NULL) { - htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, - "AttValue: no value found\n", NULL, NULL); - } + ret = htmlParseData(ctxt, MASK_WS_GT, 0, 1, maxLength); } return(ret); } -/** - * htmlParseSystemLiteral: - * @ctxt: an HTML parser context - * - * parse an HTML Literal - * - * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") - * - * Returns the SystemLiteral parsed or NULL - */ +static void +htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf, + int size, int mode) { + if ((ctxt->sax == NULL) || (ctxt->disableSAX)) + return; -static xmlChar * -htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { - size_t len = 0, startPosition = 0; - int err = 0; - int quote; - xmlChar *ret = NULL; + if ((mode == 0) || (mode == DATA_RCDATA) || + (ctxt->sax->cdataBlock == NULL)) { + if ((ctxt->name == NULL) || + (xmlStrEqual(ctxt->name, BAD_CAST "html")) || + (xmlStrEqual(ctxt->name, BAD_CAST "head"))) { + int i; - if ((CUR != '"') && (CUR != '\'')) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, - "SystemLiteral \" or ' expected\n", NULL, NULL); - return(NULL); - } - quote = CUR; - NEXT; + /* + * Add leading whitespace to html or head elements before + * calling 
htmlStartCharData. + */ + for (i = 0; i < size; i++) + if (!IS_WS_HTML(buf[i])) + break; + + if (i > 0) { + if (!ctxt->keepBlanks) { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, buf, i); + } else { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, i); + } - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while ((CUR != 0) && (CUR != quote)) { - /* TODO: Handle UTF-8 */ - if (!IS_CHAR_CH(CUR)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in SystemLiteral 0x%X\n", CUR); - err = 1; - } - NEXT; - len++; - } - if (CUR != quote) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished SystemLiteral\n", NULL, NULL); - } else { - NEXT; - if (err == 0) - ret = xmlStrndup((BASE_PTR+startPosition), len); - } + buf += i; + size -= i; + } - return(ret); + if (size <= 0) + return; + + htmlStartCharData(ctxt); + + if (PARSER_STOPPED(ctxt)) + return; + } + + if ((mode == 0) && + (!ctxt->keepBlanks) && + (areBlanks(ctxt, buf, size) > 0)) { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size); + } else { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, size); + } + } else { + /* + * Insert as CDATA, which is the same as HTML_PRESERVE_NODE + */ + ctxt->sax->cdataBlock(ctxt->userData, buf, size); + } } /** - * htmlParsePubidLiteral: + * htmlParseCharData: * @ctxt: an HTML parser context + * @partial: true if the input buffer is incomplete * - * parse an HTML public literal + * Parse character data and references. * - * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" - * - * Returns the PubidLiteral parsed or NULL. + * Returns 1 if all data was parsed, 0 otherwise. 
*/ -static xmlChar * -htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { - size_t len = 0, startPosition = 0; - int err = 0; - int quote; - xmlChar *ret = NULL; +static int +htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) { + xmlParserInputPtr input = ctxt->input; + xmlChar utf8Char[4]; + int complete = 0; + int done = 0; + int mode; + int eof = PARSER_PROGRESSIVE(ctxt); + int line, col; + + mode = ctxt->endCheckState; + + line = input->line; + col = input->col; + + while (!PARSER_STOPPED(ctxt)) { + const xmlChar *chunk, *in, *repl; + size_t avail; + int replSize; + int skip = 0; + int ncr = 0; + int ncrSize = 0; + int cp = 0; + + chunk = input->cur; + avail = input->end - chunk; + in = chunk; + + repl = BAD_CAST ""; + replSize = 0; + + while (!PARSER_STOPPED(ctxt)) { + size_t j; + int cur, size; + + if (avail <= 64) { + if (!eof) { + size_t oldAvail = avail; + size_t off = in - chunk; + + input->cur = in; + + xmlParserGrow(ctxt); + + in = input->cur; + chunk = in - off; + input->cur = chunk; + avail = input->end - in; + + if (oldAvail == avail) + eof = 1; + } - if ((CUR != '"') && (CUR != '\'')) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, - "PubidLiteral \" or ' expected\n", NULL, NULL); - return(NULL); - } - quote = CUR; - NEXT; + if (avail == 0) { + if ((partial) && (ncr)) { + in -= ncrSize; + ncrSize = 0; + } - /* - * Name ::= (Letter | '_') (NameChar)* - */ - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; + done = 1; + break; + } + } - while ((CUR != 0) && (CUR != quote)) { - if (!IS_PUBIDCHAR_CH(CUR)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in PubidLiteral 0x%X\n", CUR); - err = 1; - } - len++; - NEXT; - } + /* Accelerator */ + if (!ncr) { + while (avail > 0) { + static const unsigned mask[8] = { + 0x00002401, 0x10002040, + 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF + }; + cur = *in; + if ((1u << (cur & 0x1F)) & mask[cur >> 5]) + break; + col += 1; + in += 1; + 
avail -= 1; + } - if (CUR != quote) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished PubidLiteral\n", NULL, NULL); - } else { - NEXT; - if (err == 0) - ret = xmlStrndup((BASE_PTR + startPosition), len); - } + if ((!eof) && (avail <= 64)) + continue; + if (avail == 0) + continue; + } - return(ret); -} + cur = *in; + size = 1; + col += 1; -/** - * htmlParseScript: - * @ctxt: an HTML parser context - * - * parse the content of an HTML SCRIPT or STYLE element - * http://www.w3.org/TR/html4/sgml/dtd.html#Script - * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet - * http://www.w3.org/TR/html4/types.html#type-script - * http://www.w3.org/TR/html4/types.html#h-6.15 - * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 - * - * Script data ( %Script; in the DTD) can be the content of the SCRIPT - * element and the value of intrinsic event attributes. User agents must - * not evaluate script data as HTML markup but instead must pass it on as - * data to a script engine. 
- * NOTES: - * - The content is passed like CDATA - * - the attributes for style and scripting "onXXX" are also described - * as CDATA but SGML allows entities references in attributes so their - * processing is identical as other attributes - */ -static void -htmlParseScript(htmlParserCtxtPtr ctxt) { - xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; - int nbchar = 0; - int cur,l; + if (ncr) { + int lc = cur | 0x20; + int digit; - SHRINK; - cur = CUR_CHAR(l); - while (cur != 0) { - if ((cur == '<') && (NXT(1) == '/')) { - /* - * One should break here, the specification is clear: - * Authors should therefore escape "recovery) { - if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, - xmlStrlen(ctxt->name)) == 0) - { - break; /* while */ + if ((cur >= '0') && (cur <= '9')) { + digit = cur - '0'; + } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) { + digit = (lc - 'a') + 10; } else { - htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, - "Element %s embeds close tag\n", - ctxt->name, NULL); - } - } else { - if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || - ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) - { - break; /* while */ + if (cur == ';') { + in += 1; + size += 1; + ncrSize += 1; + } + goto next_chunk; } + + cp = cp * ncr + digit; + if (cp >= 0x110000) + cp = 0x110000; + + ncrSize += 1; + + goto next_char; } - } - if (IS_CHAR(cur)) { - COPY_BUF(l,buf,nbchar,cur); - } else { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in CDATA 0x%X\n", cur); + + switch (cur) { + case '<': + if (mode == 0) { + done = 1; + complete = 1; + goto next_chunk; + } + if (mode == DATA_PLAINTEXT) + break; + + j = 1; + if (j < avail) { + if ((mode == DATA_SCRIPT) && (in[j] == '!')) { + /* Check for comment start */ + + j += 1; + if ((j < avail) && (in[j] == '-')) { + j += 1; + if ((j < avail) && (in[j] == '-')) + mode = DATA_SCRIPT_ESC1; + } + } else { + int i = 0; + int solidus = 0; + + /* Check for tag */ + + if (in[j] == '/') { + j += 1; + solidus = 1; + } + + if ((solidus) || (mode == 
DATA_SCRIPT_ESC1)) { + while ((j < avail) && + (ctxt->name[i] != 0) && + (ctxt->name[i] == (in[j] | 0x20))) { + i += 1; + j += 1; + } + + if ((ctxt->name[i] == 0) && (j < avail)) { + int c = in[j]; + + if ((c == '>') || (c == '/') || + (IS_WS_HTML(c))) { + if ((mode == DATA_SCRIPT_ESC1) && + (!solidus)) { + mode = DATA_SCRIPT_ESC2; + } else if (mode == DATA_SCRIPT_ESC2) { + mode = DATA_SCRIPT_ESC1; + } else { + complete = 1; + done = 1; + goto next_chunk; + } + } + } + } + } + } + + if ((partial) && (j >= avail)) { + done = 1; + goto next_chunk; + } + + break; + + case '-': + if ((mode != DATA_SCRIPT_ESC1) && (mode != DATA_SCRIPT_ESC2)) + break; + + /* Check for comment end */ + + j = 1; + if ((j < avail) && (in[j] == '-')) { + j += 1; + if ((j < avail) && (in[j] == '>')) + mode = DATA_SCRIPT; + } + + if ((partial) && (j >= avail)) { + done = 1; + goto next_chunk; + } + + break; + + case '&': + if ((mode != 0) && (mode != DATA_RCDATA)) + break; + + j = 1; + + if ((j < avail) && (in[j] == '#')) { + j += 1; + if (j < avail) { + if ((in[j] | 0x20) == 'x') { + j += 1; + if ((j < avail) && (IS_HEX_DIGIT(in[j]))) { + ncr = 16; + size = 3; + ncrSize = 3; + cp = 0; + } + } else if (IS_ASCII_DIGIT(in[j])) { + ncr = 10; + size = 2; + ncrSize = 2; + cp = 0; + } + } + } else { + if (partial) { + int terminated = 0; + size_t i; + + /* + * ∳ has 33 bytes. 
+ */ + for (i = 1; i < avail; i++) { + if ((i >= 32) || + (!IS_ASCII_LETTER(in[i]) && + ((i < 2) || !IS_ASCII_DIGIT(in[i])))) { + terminated = 1; + break; + } + } + + if (!terminated) { + done = 1; + goto next_chunk; + } + } + + repl = htmlFindEntityPrefix(in + j, + avail - j, + /* isAttr */ 0, + &skip, &replSize); + if (repl != NULL) { + skip += 1; + goto next_chunk; + } + + skip = 0; + } + + if ((partial) && (j >= avail)) { + done = 1; + goto next_chunk; + } + + break; + + case '\0': + skip = 1; + repl = BAD_CAST "\xEF\xBF\xBD"; + replSize = 3; + goto next_chunk; + + case '\n': + line += 1; + col = 1; + break; + + case '\r': + if (partial && avail < 2) { + done = 1; + goto next_chunk; + } + + skip = 1; + if (in[1] != 0x0A) { + repl = BAD_CAST "\x0A"; + replSize = 1; + } + goto next_chunk; + + default: + if (cur < 0x80) + break; + + if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) { + xmlChar * guess; + + if (in > chunk) + goto next_chunk; + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + guess = NULL; +#else + guess = htmlFindEncoding(ctxt); +#endif + if (guess == NULL) { + xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); + } else { + xmlSwitchEncodingName(ctxt, (const char *) guess); + xmlFree(guess); + } + input->flags |= XML_INPUT_HAS_ENCODING; + + eof = PARSER_PROGRESSIVE(ctxt); + goto restart; + } + + size = htmlValidateUtf8(ctxt, in, avail, partial); + + if ((partial) && (size == 0)) { + done = 1; + goto next_chunk; + } + + if (size <= 0) { + skip = 1; + repl = BAD_CAST "\xEF\xBF\xBD"; + replSize = 3; + goto next_chunk; + } + + break; + } + +next_char: + in += size; + avail -= size; } - if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { - buf[nbchar] = 0; - if (ctxt->sax->cdataBlock!= NULL) { - /* - * Insert as CDATA, which is the same as HTML_PRESERVE_NODE - */ - ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); - } else if (ctxt->sax->characters != NULL) { - ctxt->sax->characters(ctxt->userData, buf, nbchar); - } - nbchar = 0; - } - GROW; - NEXTL(l); - 
cur = CUR_CHAR(l); - } - if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { - buf[nbchar] = 0; - if (ctxt->sax->cdataBlock!= NULL) { - /* - * Insert as CDATA, which is the same as HTML_PRESERVE_NODE - */ - ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); - } else if (ctxt->sax->characters != NULL) { - ctxt->sax->characters(ctxt->userData, buf, nbchar); - } +next_chunk: + if (ncrSize > 0) { + skip = ncrSize; + in -= ncrSize; + + repl = htmlCodePointToUtf8(cp, utf8Char, &replSize); + } + + if (in > chunk) { + input->cur += in - chunk; + htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode); + } + + input->cur += skip; + if (replSize > 0) + htmlCharDataSAXCallback(ctxt, repl, replSize, mode); + + SHRINK; + + if (done) + break; + +restart: + ; } -} + input->line = line; + input->col = col; + + if (complete) + ctxt->endCheckState = 0; + else + ctxt->endCheckState = mode; + + return(complete); +} /** - * htmlParseCharDataInternal: + * htmlParseComment: * @ctxt: an HTML parser context - * @readahead: optional read ahead character in ascii range + * @bogus: true if this is a bogus comment * - * parse a CharData section. - * if we are within a CDATA section ']]>' marks an end of section. - * - * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) + * Parse an HTML comment */ - static void -htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) { - xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6]; - int nbchar = 0; - int cur, l; - int chunk = 0; - - if (readahead) - buf[nbchar++] = readahead; - - SHRINK; - cur = CUR_CHAR(l); - while (((cur != '<') || (ctxt->token == '<')) && - ((cur != '&') || (ctxt->token == '&')) && - (cur != 0)) { - if (!(IS_CHAR(cur))) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in CDATA 0x%X\n", cur); - } else { - COPY_BUF(l,buf,nbchar,cur); - } - if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { - buf[nbchar] = 0; - - /* - * Ok the segment is to be consumed as chars. 
- */ - if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { - if (areBlanks(ctxt, buf, nbchar)) { - if (ctxt->keepBlanks) { - if (ctxt->sax->characters != NULL) - ctxt->sax->characters(ctxt->userData, buf, nbchar); - } else { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, - buf, nbchar); - } - } else { - htmlCheckParagraph(ctxt); - if (ctxt->sax->characters != NULL) - ctxt->sax->characters(ctxt->userData, buf, nbchar); - } - } - nbchar = 0; - } - NEXTL(l); - chunk++; - if (chunk > HTML_PARSER_BUFFER_SIZE) { - chunk = 0; - SHRINK; - GROW; +htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) { + const xmlChar *comment = BAD_CAST ""; + xmlChar *buf = NULL; + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? + XML_MAX_HUGE_LENGTH : + XML_MAX_TEXT_LENGTH; + + if (bogus) { + buf = htmlParseData(ctxt, MASK_GT, 0, 0, maxLength); + if (CUR == '>') + SKIP(1); + comment = buf; + } else { + if (CUR == '>') { + SKIP(1); + } else if ((CUR == '-') && (NXT(1) == '>')) { + SKIP(2); + } else { + buf = htmlParseData(ctxt, MASK_DASH, 1, 0, maxLength); + comment = buf; } - cur = CUR_CHAR(l); - if (cur == 0) { - SHRINK; - GROW; - cur = CUR_CHAR(l); - } } - if (nbchar != 0) { - buf[nbchar] = 0; - /* - * Ok the segment is to be consumed as chars. 
- */ - if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { - if (areBlanks(ctxt, buf, nbchar)) { - if (ctxt->keepBlanks) { - if (ctxt->sax->characters != NULL) - ctxt->sax->characters(ctxt->userData, buf, nbchar); - } else { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, - buf, nbchar); - } - } else { - htmlCheckParagraph(ctxt); - if (ctxt->sax->characters != NULL) - ctxt->sax->characters(ctxt->userData, buf, nbchar); - } - } - } else { - /* - * Loop detection - */ - if (cur == 0) - ctxt->instate = XML_PARSER_EOF; - } + if (comment == NULL) + return; + + if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && + (!ctxt->disableSAX)) + ctxt->sax->comment(ctxt->userData, comment); + + xmlFree(buf); } /** - * htmlParseCharData: + * htmlParseCharRef: * @ctxt: an HTML parser context * - * parse a CharData section. - * if we are within a CDATA section ']]>' marks an end of section. + * DEPRECATED: Internal function, don't use. * - * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) + * Returns 0 */ - -static void -htmlParseCharData(htmlParserCtxtPtr ctxt) { - htmlParseCharDataInternal(ctxt, 0); +int +htmlParseCharRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED) { + return(0); } + /** - * htmlParseExternalID: + * htmlParseDoctypeLiteral: * @ctxt: an HTML parser context - * @publicID: a xmlChar** receiving PubidLiteral - * - * Parse an External ID or a Public ID * - * [75] ExternalID ::= 'SYSTEM' S SystemLiteral - * | 'PUBLIC' S PubidLiteral S SystemLiteral + * Parse a DOCTYPE SYTSTEM or PUBLIC literal. * - * [83] PublicID ::= 'PUBLIC' S PubidLiteral - * - * Returns the function returns SystemLiteral and in the second - * case publicID receives PubidLiteral, is strict is off - * it is possible to return NULL and have publicID set. + * Returns the literal or NULL in case of error. 
*/ static xmlChar * -htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { - xmlChar *URI = NULL; +htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) { + xmlChar *ret; + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? + XML_MAX_TEXT_LENGTH : + XML_MAX_NAME_LENGTH; - if ((UPPER == 'S') && (UPP(1) == 'Y') && - (UPP(2) == 'S') && (UPP(3) == 'T') && - (UPP(4) == 'E') && (UPP(5) == 'M')) { - SKIP(6); - if (!IS_BLANK_CH(CUR)) { - htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, - "Space required after 'SYSTEM'\n", NULL, NULL); - } - SKIP_BLANKS; - URI = htmlParseSystemLiteral(ctxt); - if (URI == NULL) { - htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, - "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); - } - } else if ((UPPER == 'P') && (UPP(1) == 'U') && - (UPP(2) == 'B') && (UPP(3) == 'L') && - (UPP(4) == 'I') && (UPP(5) == 'C')) { - SKIP(6); - if (!IS_BLANK_CH(CUR)) { - htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, - "Space required after 'PUBLIC'\n", NULL, NULL); - } - SKIP_BLANKS; - *publicID = htmlParsePubidLiteral(ctxt); - if (*publicID == NULL) { - htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, - "htmlParseExternalID: PUBLIC, no Public Identifier\n", - NULL, NULL); - } - SKIP_BLANKS; - if ((CUR == '"') || (CUR == '\'')) { - URI = htmlParseSystemLiteral(ctxt); - } + if (CUR == '"') { + SKIP(1); + ret = htmlParseData(ctxt, MASK_DQ_GT, 0, 0, maxLength); + if (CUR == '"') + SKIP(1); + } else if (CUR == '\'') { + SKIP(1); + ret = htmlParseData(ctxt, MASK_SQ_GT, 0, 0, maxLength); + if (CUR == '\'') + SKIP(1); + } else { + return(NULL); } - return(URI); + + return(ret); } -/** - * xmlParsePI: - * @ctxt: an XML parser context - * - * parse an XML Processing Instruction. - * - * [16] PI ::= '' Char*)))? 
'?>' - */ static void -htmlParsePI(htmlParserCtxtPtr ctxt) { - xmlChar *buf = NULL; - int len = 0; - int size = HTML_PARSER_BUFFER_SIZE; - int cur, l; - const xmlChar *target; - xmlParserInputState state; - int count = 0; - - if ((RAW == '<') && (NXT(1) == '?')) { - state = ctxt->instate; - ctxt->instate = XML_PARSER_PI; - /* - * this is a Processing Instruction. - */ - SKIP(2); - SHRINK; +htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt) { + const xmlChar *in; + size_t avail; + int eof = PARSER_PROGRESSIVE(ctxt); + int line, col; - /* - * Parse the target name and check for special support like - * namespace. - */ - target = htmlParseName(ctxt); - if (target != NULL) { - if (RAW == '>') { - SKIP(1); + line = ctxt->input->line; + col = ctxt->input->col; - /* - * SAX: PI detected. - */ - if ((ctxt->sax) && (!ctxt->disableSAX) && - (ctxt->sax->processingInstruction != NULL)) - ctxt->sax->processingInstruction(ctxt->userData, - target, NULL); - ctxt->instate = state; - return; - } - buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); - if (buf == NULL) { - htmlErrMemory(ctxt, NULL); - ctxt->instate = state; - return; - } - cur = CUR; - if (!IS_BLANK(cur)) { - htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, - "ParsePI: PI %s space expected\n", target, NULL); - } - SKIP_BLANKS; - cur = CUR_CHAR(l); - while ((cur != 0) && (cur != '>')) { - if (len + 5 >= size) { - xmlChar *tmp; - - size *= 2; - tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); - if (tmp == NULL) { - htmlErrMemory(ctxt, NULL); - xmlFree(buf); - ctxt->instate = state; - return; - } - buf = tmp; - } - count++; - if (count > 50) { - GROW; - count = 0; - } - if (IS_CHAR(cur)) { - COPY_BUF(l,buf,len,cur); - } else { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in processing instruction " - "0x%X\n", cur); - } - NEXTL(l); - cur = CUR_CHAR(l); - if (cur == 0) { - SHRINK; - GROW; - cur = CUR_CHAR(l); - } - } - buf[len] = 0; - if (cur != '>') { - htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, - 
"ParsePI: PI %s never end ...\n", target, NULL); - } else { - SKIP(1); - - /* - * SAX: PI detected. - */ - if ((ctxt->sax) && (!ctxt->disableSAX) && - (ctxt->sax->processingInstruction != NULL)) - ctxt->sax->processingInstruction(ctxt->userData, - target, buf); - } - xmlFree(buf); - } else { - htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, - "PI is not started correctly", NULL, NULL); - } - ctxt->instate = state; - } -} + in = ctxt->input->cur; + avail = ctxt->input->end - in; -/** - * htmlParseComment: - * @ctxt: an HTML parser context - * - * Parse an XML (SGML) comment - * - * [15] Comment ::= '' - */ -static void -htmlParseComment(htmlParserCtxtPtr ctxt) { - xmlChar *buf = NULL; - int len; - int size = HTML_PARSER_BUFFER_SIZE; - int q, ql; - int r, rl; - int cur, l; - int next, nl; - xmlParserInputState state; + while (!PARSER_STOPPED(ctxt)) { + int cur; - /* - * Check that there is a comment right here. - */ - if ((RAW != '<') || (NXT(1) != '!') || - (NXT(2) != '-') || (NXT(3) != '-')) return; + if ((!eof) && (avail <= 64)) { + size_t oldAvail = avail; - state = ctxt->instate; - ctxt->instate = XML_PARSER_COMMENT; - SHRINK; - SKIP(4); - buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); - if (buf == NULL) { - htmlErrMemory(ctxt, "buffer allocation failed\n"); - ctxt->instate = state; - return; - } - len = 0; - buf[len] = 0; - q = CUR_CHAR(ql); - if (q == 0) - goto unfinished; - NEXTL(ql); - r = CUR_CHAR(rl); - if (r == 0) - goto unfinished; - NEXTL(rl); - cur = CUR_CHAR(l); - while ((cur != 0) && - ((cur != '>') || - (r != '-') || (q != '-'))) { - NEXTL(l); - next = CUR_CHAR(nl); - if (next == 0) { - SHRINK; - GROW; - next = CUR_CHAR(nl); - } + ctxt->input->cur = in; - if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) { - htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, - "Comment incorrectly closed by '--!>'", NULL, NULL); - cur = '>'; - break; - } + xmlParserGrow(ctxt); - if (len + 5 >= size) { - xmlChar *tmp; + in = ctxt->input->cur; + 
avail = ctxt->input->end - in; - size *= 2; - tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); - if (tmp == NULL) { - xmlFree(buf); - htmlErrMemory(ctxt, "growing buffer failed\n"); - ctxt->instate = state; - return; - } - buf = tmp; - } - if (IS_CHAR(q)) { - COPY_BUF(ql,buf,len,q); - } else { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in comment 0x%X\n", q); + if (oldAvail == avail) + eof = 1; } - q = r; - ql = rl; - r = cur; - rl = l; - cur = next; - l = nl; - } - buf[len] = 0; - if (cur == '>') { - NEXT; - if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && - (!ctxt->disableSAX)) - ctxt->sax->comment(ctxt->userData, buf); - xmlFree(buf); - ctxt->instate = state; - return; - } + if (avail == 0) + break; -unfinished: - htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, - "Comment not terminated \n is a complete comment, but + * is not + * is + */ + if ((NXT(mark+2) == '>') || + ((mark >= 4) && (NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) { + ctxt->checkIndex = 0; + break; } - ctxt->checkIndex = cur + mark + 1; + offset = (NXT(mark+2) == '!') ? 
3 : 2; + if (mark + offset >= ctxt->input->end - ctxt->input->cur) { + ctxt->checkIndex = mark; + return(-1); + } + ctxt->checkIndex = mark + 1; } return mark; } @@ -5439,843 +5021,277 @@ htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt) * * Returns zero if no parsing was possible */ -static int +static void htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { - int ret = 0; - htmlParserInputPtr in; - ptrdiff_t avail = 0; - xmlChar cur, next; - - htmlParserNodeInfo node_info; - -#ifdef DEBUG_PUSH - switch (ctxt->instate) { - case XML_PARSER_EOF: - xmlGenericError(xmlGenericErrorContext, - "HPP: try EOF\n"); break; - case XML_PARSER_START: - xmlGenericError(xmlGenericErrorContext, - "HPP: try START\n"); break; - case XML_PARSER_MISC: - xmlGenericError(xmlGenericErrorContext, - "HPP: try MISC\n");break; - case XML_PARSER_COMMENT: - xmlGenericError(xmlGenericErrorContext, - "HPP: try COMMENT\n");break; - case XML_PARSER_PROLOG: - xmlGenericError(xmlGenericErrorContext, - "HPP: try PROLOG\n");break; - case XML_PARSER_START_TAG: - xmlGenericError(xmlGenericErrorContext, - "HPP: try START_TAG\n");break; - case XML_PARSER_CONTENT: - xmlGenericError(xmlGenericErrorContext, - "HPP: try CONTENT\n");break; - case XML_PARSER_CDATA_SECTION: - xmlGenericError(xmlGenericErrorContext, - "HPP: try CDATA_SECTION\n");break; - case XML_PARSER_END_TAG: - xmlGenericError(xmlGenericErrorContext, - "HPP: try END_TAG\n");break; - case XML_PARSER_ENTITY_DECL: - xmlGenericError(xmlGenericErrorContext, - "HPP: try ENTITY_DECL\n");break; - case XML_PARSER_ENTITY_VALUE: - xmlGenericError(xmlGenericErrorContext, - "HPP: try ENTITY_VALUE\n");break; - case XML_PARSER_ATTRIBUTE_VALUE: - xmlGenericError(xmlGenericErrorContext, - "HPP: try ATTRIBUTE_VALUE\n");break; - case XML_PARSER_DTD: - xmlGenericError(xmlGenericErrorContext, - "HPP: try DTD\n");break; - case XML_PARSER_EPILOG: - xmlGenericError(xmlGenericErrorContext, - "HPP: try EPILOG\n");break; - case XML_PARSER_PI: - 
xmlGenericError(xmlGenericErrorContext, - "HPP: try PI\n");break; - case XML_PARSER_SYSTEM_LITERAL: - xmlGenericError(xmlGenericErrorContext, - "HPP: try SYSTEM_LITERAL\n");break; - } -#endif - - while (1) { + while (PARSER_STOPPED(ctxt) == 0) { + htmlParserInputPtr in; + size_t avail; in = ctxt->input; if (in == NULL) break; - if (in->buf == NULL) - avail = in->length - (in->cur - in->base); - else - avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - - (in->cur - in->base); - if ((avail == 0) && (terminate)) { - htmlAutoCloseOnEnd(ctxt); - if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { - /* - * SAX: end of the document processing. - */ - ctxt->instate = XML_PARSER_EOF; - if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) - ctxt->sax->endDocument(ctxt->userData); - } - } - if (avail < 1) - goto done; - /* - * This is done to make progress and avoid an infinite loop - * if a parsing attempt was aborted by hitting a NUL byte. After - * changing htmlCurrentChar, this probably isn't necessary anymore. - * We should consider removing this check. - */ - cur = in->cur[0]; - if (cur == 0) { - SKIP(1); - continue; - } + avail = in->end - in->cur; switch (ctxt->instate) { case XML_PARSER_EOF: /* * Document parsing is done ! */ - goto done; + return; + case XML_PARSER_START: - /* - * Very first chars read from the document flow. - */ - cur = in->cur[0]; - if (IS_BLANK_CH(cur)) { - SKIP_BLANKS; - if (in->buf == NULL) - avail = in->length - (in->cur - in->base); - else - avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - - (in->cur - in->base); - } - if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) - ctxt->sax->setDocumentLocator(ctxt->userData, - &xmlDefaultSAXLocator); - if ((ctxt->sax) && (ctxt->sax->startDocument) && - (!ctxt->disableSAX)) - ctxt->sax->startDocument(ctxt->userData); + /* + * Very first chars read from the document flow. 
+ */ + if ((!terminate) && (avail < 4)) + return; - cur = in->cur[0]; - next = in->cur[1]; - if ((cur == '<') && (next == '!') && - (UPP(2) == 'D') && (UPP(3) == 'O') && - (UPP(4) == 'C') && (UPP(5) == 'T') && - (UPP(6) == 'Y') && (UPP(7) == 'P') && - (UPP(8) == 'E')) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing internal subset\n"); -#endif - htmlParseDocTypeDecl(ctxt); - ctxt->instate = XML_PARSER_PROLOG; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering PROLOG\n"); -#endif - } else { - ctxt->instate = XML_PARSER_MISC; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering MISC\n"); -#endif - } - break; - case XML_PARSER_MISC: - SKIP_BLANKS; - if (in->buf == NULL) - avail = in->length - (in->cur - in->base); - else - avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - - (in->cur - in->base); - /* - * no chars in buffer - */ - if (avail < 1) - goto done; - /* - * not enough chars in buffer - */ - if (avail < 2) { - if (!terminate) - goto done; - else - next = ' '; - } else { - next = in->cur[1]; - } - cur = in->cur[0]; - if ((cur == '<') && (next == '!') && - (in->cur[2] == '-') && (in->cur[3] == '-')) { - if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing Comment\n"); -#endif - htmlParseComment(ctxt); - ctxt->instate = XML_PARSER_MISC; - } else if ((cur == '<') && (next == '?')) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing PI\n"); -#endif - htmlParsePI(ctxt); - ctxt->instate = XML_PARSER_MISC; - } else if ((cur == '<') && (next == '!') && - (UPP(2) == 'D') && (UPP(3) == 'O') && - (UPP(4) == 'C') && (UPP(5) == 'T') && - (UPP(6) == 'Y') && (UPP(7) == 'P') && - (UPP(8) == 'E')) { - if 
((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing internal subset\n"); -#endif - htmlParseDocTypeDecl(ctxt); - ctxt->instate = XML_PARSER_PROLOG; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering PROLOG\n"); -#endif - } else if ((cur == '<') && (next == '!') && - (avail < 9)) { - goto done; - } else { - ctxt->instate = XML_PARSER_CONTENT; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering START_TAG\n"); -#endif - } - break; - case XML_PARSER_PROLOG: - SKIP_BLANKS; - if (in->buf == NULL) - avail = in->length - (in->cur - in->base); - else - avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - - (in->cur - in->base); - if (avail < 2) - goto done; - cur = in->cur[0]; - next = in->cur[1]; - if ((cur == '<') && (next == '!') && - (in->cur[2] == '-') && (in->cur[3] == '-')) { - if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing Comment\n"); -#endif - htmlParseComment(ctxt); - ctxt->instate = XML_PARSER_PROLOG; - } else if ((cur == '<') && (next == '?')) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing PI\n"); -#endif - htmlParsePI(ctxt); - ctxt->instate = XML_PARSER_PROLOG; - } else if ((cur == '<') && (next == '!') && - (avail < 4)) { - goto done; - } else { - ctxt->instate = XML_PARSER_CONTENT; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering START_TAG\n"); -#endif - } - break; - case XML_PARSER_EPILOG: - if (in->buf == NULL) - avail = in->length - (in->cur - in->base); - else - avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - - (in->cur - in->base); - if (avail < 1) - goto done; - cur = in->cur[0]; - if (IS_BLANK_CH(cur)) { - htmlParseCharData(ctxt); - goto done; - } - if 
(avail < 2) - goto done; - next = in->cur[1]; - if ((cur == '<') && (next == '!') && - (in->cur[2] == '-') && (in->cur[3] == '-')) { - if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing Comment\n"); -#endif - htmlParseComment(ctxt); - ctxt->instate = XML_PARSER_EPILOG; - } else if ((cur == '<') && (next == '?')) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing PI\n"); -#endif - htmlParsePI(ctxt); - ctxt->instate = XML_PARSER_EPILOG; - } else if ((cur == '<') && (next == '!') && - (avail < 4)) { - goto done; - } else { - ctxt->errNo = XML_ERR_DOCUMENT_END; - ctxt->wellFormed = 0; - ctxt->instate = XML_PARSER_EOF; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering EOF\n"); -#endif - if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) - ctxt->sax->endDocument(ctxt->userData); - goto done; - } - break; - case XML_PARSER_START_TAG: { - const xmlChar *name; - int failed; - const htmlElemDesc * info; + xmlDetectEncoding(ctxt); - /* - * no chars in buffer - */ - if (avail < 1) - goto done; - /* - * not enough chars in buffer - */ - if (avail < 2) { - if (!terminate) - goto done; - else - next = ' '; - } else { - next = in->cur[1]; - } - cur = in->cur[0]; - if (cur != '<') { - ctxt->instate = XML_PARSER_CONTENT; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif - break; - } - if (next == '/') { - ctxt->instate = XML_PARSER_END_TAG; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering END_TAG\n"); -#endif - break; - } - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) - goto done; - - /* Capture start position */ - if (ctxt->record_info) { - node_info.begin_pos = ctxt->input->consumed + - (CUR_PTR - 
ctxt->input->base); - node_info.begin_line = ctxt->input->line; - } - - - failed = htmlParseStartTag(ctxt); - name = ctxt->name; - if ((failed == -1) || - (name == NULL)) { - if (CUR == '>') - NEXT; - break; - } + /* + * TODO: Implement HTML5 prescan algorithm + */ - /* - * Lookup the info for that element. - */ - info = htmlTagLookup(name); - if (info == NULL) { - htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, - "Tag %s invalid\n", name, NULL); - } + /* + * This is wrong but matches long-standing behavior. In most + * cases, a document starting with an XML declaration will + * specify UTF-8. The HTML5 prescan algorithm handles + * XML declarations in a better way. + */ + if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) && + (xmlStrncmp(ctxt->input->cur, BAD_CAST "')) { - SKIP(2); - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, name); - htmlnamePop(ctxt); - ctxt->instate = XML_PARSER_CONTENT; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif - break; - } + /* fall through */ - if (CUR == '>') { - NEXT; - } else { - htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, - "Couldn't find end of Start Tag %s\n", - name, NULL); - - /* - * end of parsing of this node. 
- */ - if (xmlStrEqual(name, ctxt->name)) { - nodePop(ctxt); - htmlnamePop(ctxt); - } - - if (ctxt->record_info) - htmlNodeInfoPush(ctxt, &node_info); - - ctxt->instate = XML_PARSER_CONTENT; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif - break; - } + case XML_PARSER_XML_DECL: + if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) { + ctxt->sax->setDocumentLocator(ctxt->userData, + (xmlSAXLocator *) &xmlDefaultSAXLocator); + } + if ((ctxt->sax) && (ctxt->sax->startDocument) && + (!ctxt->disableSAX)) + ctxt->sax->startDocument(ctxt->userData); - /* - * Check for an Empty Element from DTD definition - */ - if ((info != NULL) && (info->empty)) { - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, name); - htmlnamePop(ctxt); - } + /* Allow callback to modify state for tests */ + if ((ctxt->instate == XML_PARSER_START) || + (ctxt->instate == XML_PARSER_XML_DECL)) + ctxt->instate = XML_PARSER_MISC; + break; + + case XML_PARSER_START_TAG: + if ((!terminate) && + (htmlParseLookupGt(ctxt) < 0)) + return; - if (ctxt->record_info) - htmlNodeInfoPush(ctxt, &node_info); + htmlParseElementInternal(ctxt); ctxt->instate = XML_PARSER_CONTENT; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif break; - } + + case XML_PARSER_MISC: /* initial */ + case XML_PARSER_PROLOG: /* before html */ case XML_PARSER_CONTENT: { - xmlChar chr[2] = { 0, 0 }; + int mode; + if ((ctxt->instate == XML_PARSER_MISC) || + (ctxt->instate == XML_PARSER_PROLOG)) { + SKIP_BLANKS; + avail = in->end - in->cur; + } + + if (avail < 1) + return; /* - * Handle preparsed entities and charRef - */ - if (ctxt->token != 0) { - chr[0] = (xmlChar) ctxt->token; - htmlCheckParagraph(ctxt); - if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) - ctxt->sax->characters(ctxt->userData, chr, 1); - ctxt->token = 0; - ctxt->checkIndex = 0; - } - if ((avail == 1) && 
(terminate)) { - cur = in->cur[0]; - if ((cur != '<') && (cur != '&')) { - if (ctxt->sax != NULL) { - chr[0] = cur; - if (IS_BLANK_CH(cur)) { - if (ctxt->keepBlanks) { - if (ctxt->sax->characters != NULL) - ctxt->sax->characters( - ctxt->userData, chr, 1); - } else { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace( - ctxt->userData, chr, 1); - } - } else { - htmlCheckParagraph(ctxt); - if (ctxt->sax->characters != NULL) - ctxt->sax->characters( - ctxt->userData, chr, 1); - } - } - ctxt->token = 0; - ctxt->checkIndex = 0; - in->cur++; - break; - } - } - if (avail < 2) - goto done; - cur = in->cur[0]; - next = in->cur[1]; - if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || - (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { - /* - * Handle SCRIPT/STYLE separately - */ - if (!terminate) { - int idx; - xmlChar val; - - idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0); - if (idx < 0) - goto done; - val = in->cur[idx + 2]; - if (val == 0) /* bad cut of input */ - goto done; - } - htmlParseScript(ctxt); - if ((cur == '<') && (next == '/')) { - ctxt->instate = XML_PARSER_END_TAG; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering END_TAG\n"); -#endif - break; - } - } else { - /* - * Sometimes DOCTYPE arrives in the middle of the document - */ - if ((cur == '<') && (next == '!') && - (UPP(2) == 'D') && (UPP(3) == 'O') && - (UPP(4) == 'C') && (UPP(5) == 'T') && - (UPP(6) == 'Y') && (UPP(7) == 'P') && - (UPP(8) == 'E')) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) - goto done; - htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, - "Misplaced DOCTYPE declaration\n", - BAD_CAST "DOCTYPE" , NULL); - htmlParseDocTypeDecl(ctxt); - } else if ((cur == '<') && (next == '!') && - (in->cur[2] == '-') && (in->cur[3] == '-')) { - if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing 
Comment\n"); -#endif - htmlParseComment(ctxt); - ctxt->instate = XML_PARSER_CONTENT; - } else if ((cur == '<') && (next == '?')) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing PI\n"); -#endif - htmlParsePI(ctxt); - ctxt->instate = XML_PARSER_CONTENT; - } else if ((cur == '<') && (next == '!') && (avail < 4)) { - goto done; - } else if ((cur == '<') && (next == '/')) { - ctxt->instate = XML_PARSER_END_TAG; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering END_TAG\n"); -#endif - break; - } else if (cur == '<') { - if ((!terminate) && (next == 0)) - goto done; + * Note that endCheckState is also used by + * xmlParseLookupGt. + */ + mode = ctxt->endCheckState; + + if (mode != 0) { + if (htmlParseCharData(ctxt, !terminate) == 0) + return; + } else if (in->cur[0] == '<') { + int next; + + if (avail < 2) { + if (!terminate) + return; + next = ' '; + } else { + next = in->cur[1]; + } + + if (next == '!') { + if ((!terminate) && (avail < 4)) + return; + if ((in->cur[2] == '-') && (in->cur[3] == '-')) { + if ((!terminate) && + (htmlParseLookupCommentEnd(ctxt) < 0)) + return; + SKIP(4); + htmlParseComment(ctxt, /* bogus */ 0); + /* don't change state */ + break; + } + + if ((!terminate) && (avail < 9)) + return; + if ((UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { + if ((!terminate) && + (htmlParseLookupString(ctxt, 9, ">", 1, + 0) < 0)) + return; + htmlParseDocTypeDecl(ctxt); + if (ctxt->instate == XML_PARSER_MISC) + ctxt->instate = XML_PARSER_PROLOG; + else + ctxt->instate = XML_PARSER_CONTENT; + } else { + ctxt->instate = XML_PARSER_CONTENT; + if ((!terminate) && + (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0)) + return; + SKIP(2); + htmlParseComment(ctxt, /* bogus */ 1); + } + } else if (next == '?') { + if 
((!terminate) && + (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0)) + return; + SKIP(1); + htmlParseComment(ctxt, /* bogus */ 1); + /* don't change state */ + } else if (next == '/') { + ctxt->instate = XML_PARSER_END_TAG; + ctxt->checkIndex = 0; + } else if (IS_ASCII_LETTER(next)) { ctxt->instate = XML_PARSER_START_TAG; ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering START_TAG\n"); -#endif - break; - } else { - /* - * check that the text sequence is complete - * before handing out the data to the parser - * to avoid problems with erroneous end of - * data detection. - */ - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0)) - goto done; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing char data\n"); -#endif - while ((ctxt->instate != XML_PARSER_EOF) && - (cur != '<') && (in->cur < in->end)) { - if (cur == '&') { - htmlParseReference(ctxt); - } else { - htmlParseCharData(ctxt); - } - cur = in->cur[0]; - } - } + } else { + ctxt->instate = XML_PARSER_CONTENT; + htmlStartCharData(ctxt); + if ((ctxt->sax != NULL) && (!ctxt->disableSAX) && + (ctxt->sax->characters != NULL)) + ctxt->sax->characters(ctxt->userData, + BAD_CAST "<", 1); + SKIP(1); + } + } else { + ctxt->instate = XML_PARSER_CONTENT; + /* + * We follow the logic of the XML push parser + */ + if (avail < HTML_PARSER_BIG_BUFFER_SIZE) { + if ((!terminate) && + (htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0)) + return; + } + ctxt->checkIndex = 0; + if (htmlParseCharData(ctxt, !terminate) == 0) + return; } break; } + case XML_PARSER_END_TAG: - if (avail < 2) - goto done; if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) - goto done; + (htmlParseLookupGt(ctxt) < 0)) + return; htmlParseEndTag(ctxt); - if (ctxt->nameNr == 0) { - ctxt->instate = XML_PARSER_EPILOG; - } else { - ctxt->instate = XML_PARSER_CONTENT; - } - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - 
xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif - break; - case XML_PARSER_CDATA_SECTION: - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "HPP: internal error, state == CDATA\n", - NULL, NULL); - ctxt->instate = XML_PARSER_CONTENT; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif - break; - case XML_PARSER_DTD: - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "HPP: internal error, state == DTD\n", - NULL, NULL); - ctxt->instate = XML_PARSER_CONTENT; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif - break; - case XML_PARSER_COMMENT: - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "HPP: internal error, state == COMMENT\n", - NULL, NULL); ctxt->instate = XML_PARSER_CONTENT; ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif - break; - case XML_PARSER_PI: - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "HPP: internal error, state == PI\n", - NULL, NULL); - ctxt->instate = XML_PARSER_CONTENT; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif - break; - case XML_PARSER_ENTITY_DECL: - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "HPP: internal error, state == ENTITY_DECL\n", - NULL, NULL); - ctxt->instate = XML_PARSER_CONTENT; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif - break; - case XML_PARSER_ENTITY_VALUE: - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "HPP: internal error, state == ENTITY_VALUE\n", - NULL, NULL); - ctxt->instate = XML_PARSER_CONTENT; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering DTD\n"); -#endif - break; - case XML_PARSER_ATTRIBUTE_VALUE: - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "HPP: internal error, 
state == ATTRIBUTE_VALUE\n", - NULL, NULL); - ctxt->instate = XML_PARSER_START_TAG; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering START_TAG\n"); -#endif - break; - case XML_PARSER_SYSTEM_LITERAL: - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n", - NULL, NULL); - ctxt->instate = XML_PARSER_CONTENT; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif - break; - case XML_PARSER_IGNORE: - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "HPP: internal error, state == XML_PARSER_IGNORE\n", - NULL, NULL); - ctxt->instate = XML_PARSER_CONTENT; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif - break; - case XML_PARSER_PUBLIC_LITERAL: + break; + + default: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "HPP: internal error, state == XML_PARSER_LITERAL\n", - NULL, NULL); - ctxt->instate = XML_PARSER_CONTENT; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering CONTENT\n"); -#endif + "HPP: internal error\n", NULL, NULL); + ctxt->instate = XML_PARSER_EOF; break; - - } - } -done: - if ((avail == 0) && (terminate)) { - htmlAutoCloseOnEnd(ctxt); - if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { - /* - * SAX: end of the document processing. 
- */ - ctxt->instate = XML_PARSER_EOF; - if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) - ctxt->sax->endDocument(ctxt->userData); } } - if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) && - ((terminate) || (ctxt->instate == XML_PARSER_EOF) || - (ctxt->instate == XML_PARSER_EPILOG))) { - xmlDtdPtr dtd; - dtd = xmlGetIntSubset(ctxt->myDoc); - if (dtd == NULL) - ctxt->myDoc->intSubset = - xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", - BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", - BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); - } -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret); -#endif - return(ret); } /** * htmlParseChunk: * @ctxt: an HTML parser context - * @chunk: an char array - * @size: the size in byte of the chunk + * @chunk: chunk of memory + * @size: size of chunk in bytes * @terminate: last chunk indicator * - * Parse a Chunk of memory + * Parse a chunk of memory in push parser mode. * - * Returns zero if no error, the xmlParserErrors otherwise. + * Assumes that the parser context was initialized with + * htmlCreatePushParserCtxt. + * + * The last chunk, which will often be empty, must be marked with + * the @terminate flag. With the default SAX callbacks, the resulting + * document will be available in ctxt->myDoc. This pointer will not + * be freed by the library. + * + * If the document isn't well-formed, ctxt->myDoc is set to NULL. + * + * Returns an xmlParserErrors code (0 on success). 
*/ int htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, int terminate) { - if ((ctxt == NULL) || (ctxt->input == NULL)) { - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "htmlParseChunk: context error\n", NULL, NULL); - return(XML_ERR_INTERNAL_ERROR); - } - if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && - (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { - size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); - size_t cur = ctxt->input->cur - ctxt->input->base; + if ((ctxt == NULL) || + (ctxt->input == NULL) || (ctxt->input->buf == NULL) || + (size < 0) || + ((size > 0) && (chunk == NULL))) + return(XML_ERR_ARGUMENT); + if (PARSER_STOPPED(ctxt) != 0) + return(ctxt->errNo); + + if (size > 0) { + size_t pos = ctxt->input->cur - ctxt->input->base; int res; res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); - xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); + xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos); if (res < 0) { - ctxt->errNo = XML_PARSER_EOF; - ctxt->disableSAX = 1; - return (XML_PARSER_EOF); - } -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); -#endif - -#if 0 - if ((terminate) || (ctxt->input->buf->buffer->use > 80)) - htmlParseTryOrFinish(ctxt, terminate); -#endif - } else if (ctxt->instate != XML_PARSER_EOF) { - if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { - xmlParserInputBufferPtr in = ctxt->input->buf; - if ((in->encoder != NULL) && (in->buffer != NULL) && - (in->raw != NULL)) { - int nbchars; - size_t base = xmlBufGetInputBase(in->buffer, ctxt->input); - size_t current = ctxt->input->cur - ctxt->input->base; - - nbchars = xmlCharEncInput(in, terminate); - xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current); - if (nbchars < 0) { - htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, - "encoder error\n", NULL, NULL); - return(XML_ERR_INVALID_ENCODING); - } - } + htmlParseErr(ctxt, 
ctxt->input->buf->error, + "xmlParserInputBufferPush failed", NULL, NULL); + xmlHaltParser(ctxt); + return (ctxt->errNo); } } + htmlParseTryOrFinish(ctxt, terminate); - if (terminate) { - if ((ctxt->instate != XML_PARSER_EOF) && - (ctxt->instate != XML_PARSER_EPILOG) && - (ctxt->instate != XML_PARSER_MISC)) { - ctxt->errNo = XML_ERR_DOCUMENT_END; - ctxt->wellFormed = 0; - } - if (ctxt->instate != XML_PARSER_EOF) { - if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) - ctxt->sax->endDocument(ctxt->userData); - } + + if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) { + htmlAutoCloseOnEnd(ctxt); + + /* + * Only check for truncated multi-byte sequences + */ + xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR); + + if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) + ctxt->sax->endDocument(ctxt->userData); + + if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && + (ctxt->myDoc != NULL)) { + xmlDtdPtr dtd; + dtd = xmlGetIntSubset(ctxt->myDoc); + if (dtd == NULL) { + ctxt->myDoc->intSubset = + xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", + BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", + BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); + if (ctxt->myDoc->intSubset == NULL) + htmlErrMemory(ctxt); + } + } + ctxt->instate = XML_PARSER_EOF; } + return((xmlParserErrors) ctxt->errNo); } @@ -6287,88 +5303,45 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, /** * htmlCreatePushParserCtxt: - * @sax: a SAX handler - * @user_data: The user data returned on SAX callbacks - * @chunk: a pointer to an array of chars + * @sax: a SAX handler (optional) + * @user_data: The user data returned on SAX callbacks (optional) + * @chunk: a pointer to an array of chars (optional) * @size: number of chars in the array - * @filename: an optional file name or URI - * @enc: an optional encoding + * @filename: only used for error reporting (optional) + * @enc: encoding (deprecated, pass XML_CHAR_ENCODING_NONE) * - * Create a parser context for using the HTML parser 
in push mode - * The value of @filename is used for fetching external entities - * and error/warning reports. + * Create a parser context for using the HTML parser in push mode. * - * Returns the new parser context or NULL + * Returns the new parser context or NULL if a memory allocation + * failed. */ htmlParserCtxtPtr htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, const char *chunk, int size, const char *filename, xmlCharEncoding enc) { htmlParserCtxtPtr ctxt; - htmlParserInputPtr inputStream; - xmlParserInputBufferPtr buf; - - xmlInitParser(); - - buf = xmlAllocParserInputBuffer(enc); - if (buf == NULL) return(NULL); + htmlParserInputPtr input; + const char *encoding; - ctxt = htmlNewParserCtxt(); - if (ctxt == NULL) { - xmlFreeParserInputBuffer(buf); + ctxt = htmlNewSAXParserCtxt(sax, user_data); + if (ctxt == NULL) return(NULL); - } - if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) - ctxt->charset=XML_CHAR_ENCODING_UTF8; - if (sax != NULL) { - if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) - xmlFree(ctxt->sax); - ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); - if (ctxt->sax == NULL) { - xmlFree(buf); - xmlFree(ctxt); - return(NULL); - } - memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); - if (user_data != NULL) - ctxt->userData = user_data; - } - if (filename == NULL) { - ctxt->directory = NULL; - } else { - ctxt->directory = xmlParserGetDirectory(filename); - } - inputStream = htmlNewInputStream(ctxt); - if (inputStream == NULL) { - xmlFreeParserCtxt(ctxt); - xmlFree(buf); + encoding = xmlGetCharEncodingName(enc); + input = xmlNewPushInput(filename, chunk, size); + if (input == NULL) { + htmlFreeParserCtxt(ctxt); return(NULL); } - if (filename == NULL) - inputStream->filename = NULL; - else - inputStream->filename = (char *) - xmlCanonicPath((const xmlChar *) filename); - inputStream->buf = buf; - xmlBufResetInput(buf->buffer, inputStream); - - inputPush(ctxt, inputStream); - - if ((size > 0) && (chunk != 
NULL) && (ctxt->input != NULL) && - (ctxt->input->buf != NULL)) { - size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); - size_t cur = ctxt->input->cur - ctxt->input->base; - - xmlParserInputBufferPush(ctxt->input->buf, size, chunk); - - xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); -#endif + if (xmlCtxtPushInput(ctxt, input) < 0) { + xmlFreeInputStream(input); + xmlFreeParserCtxt(ctxt); + return(NULL); } - ctxt->progressive = 1; + + if (encoding != NULL) + xmlSwitchEncodingName(ctxt, encoding); return(ctxt); } @@ -6381,6 +5354,8 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, * @sax: the SAX handler block * @userData: if using SAX, this pointer will be provided on callbacks. * + * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc. + * * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks * to handle parse events. If sax is NULL, fallback to the default DOM * behavior and return a tree. 
@@ -6395,25 +5370,20 @@ htmlSAXParseDoc(const xmlChar *cur, const char *encoding, htmlDocPtr ret; htmlParserCtxtPtr ctxt; - xmlInitParser(); - - if (cur == NULL) return(NULL); + if (cur == NULL) + return(NULL); + ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding); + if (ctxt == NULL) + return(NULL); - ctxt = htmlCreateDocParserCtxt(cur, encoding); - if (ctxt == NULL) return(NULL); if (sax != NULL) { - if (ctxt->sax != NULL) xmlFree (ctxt->sax); - ctxt->sax = sax; + *ctxt->sax = *sax; ctxt->userData = userData; } htmlParseDocument(ctxt); ret = ctxt->myDoc; - if (sax != NULL) { - ctxt->sax = NULL; - ctxt->userData = NULL; - } htmlFreeParserCtxt(ctxt); return(ret); @@ -6422,9 +5392,13 @@ htmlSAXParseDoc(const xmlChar *cur, const char *encoding, /** * htmlParseDoc: * @cur: a pointer to an array of xmlChar - * @encoding: a free form C string describing the HTML document encoding, or NULL + * @encoding: the encoding (optional) + * + * DEPRECATED: Use htmlReadDoc. * - * parse an HTML in-memory document and build a tree. + * Parse an HTML in-memory document and build a tree. + * + * This function uses deprecated global parser options. * * Returns the resulting document tree */ @@ -6438,22 +5412,24 @@ htmlParseDoc(const xmlChar *cur, const char *encoding) { /** * htmlCreateFileParserCtxt: * @filename: the filename - * @encoding: a free form C string describing the HTML document encoding, or NULL + * @encoding: optional encoding + * + * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile. + * + * Create a parser context to read from a file. + * + * A non-NULL encoding overrides encoding declarations in the document. * - * Create a parser context for a file content. * Automatic support for ZLIB/Compress compressed document is provided * by default if found at compile-time. * - * Returns the new parser context or NULL + * Returns the new parser context or NULL if a memory allocation failed. 
*/ htmlParserCtxtPtr htmlCreateFileParserCtxt(const char *filename, const char *encoding) { htmlParserCtxtPtr ctxt; - htmlParserInputPtr inputStream; - char *canonicFilename; - /* htmlCharEncoding enc; */ - xmlChar *content, *content_line = (xmlChar *) "charset="; + htmlParserInputPtr input; if (filename == NULL) return(NULL); @@ -6462,39 +5438,16 @@ htmlCreateFileParserCtxt(const char *filename, const char *encoding) if (ctxt == NULL) { return(NULL); } - canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); - if (canonicFilename == NULL) { -#ifdef LIBXML_SAX1_ENABLED - if (xmlDefaultSAXHandler.error != NULL) { - xmlDefaultSAXHandler.error(NULL, "out of memory\n"); - } -#endif - xmlFreeParserCtxt(ctxt); - return(NULL); - } - inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); - xmlFree(canonicFilename); - if (inputStream == NULL) { + input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0); + if (input == NULL) { xmlFreeParserCtxt(ctxt); return(NULL); } - - inputPush(ctxt, inputStream); - - /* set encoding */ - if (encoding) { - size_t l = strlen(encoding); - - if (l < 1000) { - content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1); - if (content) { - strcpy ((char *)content, (char *)content_line); - strcat ((char *)content, (char *)encoding); - htmlCheckEncoding (ctxt, content); - xmlFree (content); - } - } + if (xmlCtxtPushInput(ctxt, input) < 0) { + xmlFreeInputStream(input); + xmlFreeParserCtxt(ctxt); + return(NULL); } return(ctxt); @@ -6503,10 +5456,12 @@ htmlCreateFileParserCtxt(const char *filename, const char *encoding) /** * htmlSAXParseFile: * @filename: the filename - * @encoding: a free form C string describing the HTML document encoding, or NULL + * @encoding: encoding (optional) * @sax: the SAX handler block * @userData: if using SAX, this pointer will be provided on callbacks. * + * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile. + * * parse an HTML file and build a tree. 
Automatic support for ZLIB/Compress * compressed document is provided by default if found at compile-time. * It use the given SAX function block to handle the parsing callback. @@ -6523,8 +5478,6 @@ htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr s htmlParserCtxtPtr ctxt; htmlSAXHandlerPtr oldsax = NULL; - xmlInitParser(); - ctxt = htmlCreateFileParserCtxt(filename, encoding); if (ctxt == NULL) return(NULL); if (sax != NULL) { @@ -6548,10 +5501,9 @@ htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr s /** * htmlParseFile: * @filename: the filename - * @encoding: a free form C string describing the HTML document encoding, or NULL + * @encoding: encoding (optional) * - * parse an HTML file and build a tree. Automatic support for ZLIB/Compress - * compressed document is provided by default if found at compile-time. + * Parse an HTML file and build a tree. * * Returns the resulting document tree */ @@ -6565,6 +5517,8 @@ htmlParseFile(const char *filename, const char *encoding) { * htmlHandleOmittedElem: * @val: int 0 or 1 * + * DEPRECATED: Use HTML_PARSE_NOIMPLIED + * * Set and return the previous value for handling HTML omitted tags. * * Returns the last value for 0 for no handling, 1 for auto insertion. @@ -6583,113 +5537,64 @@ htmlHandleOmittedElem(int val) { * @parent: HTML parent element * @elt: HTML element * - * Checks whether an HTML element may be a direct child of a parent element. - * Note - doesn't check for deprecated elements + * DEPRECATED: Don't use. * - * Returns 1 if allowed; 0 otherwise. + * Returns 1 */ int -htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { - const char** p ; - - if ( ! elt || ! parent || ! 
parent->subelts ) - return 0 ; - - for ( p = parent->subelts; *p; ++p ) - if ( !xmlStrcmp((const xmlChar *)*p, elt) ) - return 1 ; - - return 0 ; +htmlElementAllowedHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED, + const xmlChar* elt ATTRIBUTE_UNUSED) { + return(1); } + /** * htmlElementStatusHere: * @parent: HTML parent element * @elt: HTML element * - * Checks whether an HTML element may be a direct child of a parent element. - * and if so whether it is valid or deprecated. + * DEPRECATED: Don't use. * - * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID + * Returns HTML_VALID */ htmlStatus -htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { - if ( ! parent || ! elt ) - return HTML_INVALID ; - if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) - return HTML_INVALID ; - - return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ; +htmlElementStatusHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED, + const htmlElemDesc* elt ATTRIBUTE_UNUSED) { + return(HTML_VALID); } + /** * htmlAttrAllowed: * @elt: HTML element * @attr: HTML attribute * @legacy: whether to allow deprecated attributes * - * Checks whether an attribute is valid for an element - * Has full knowledge of Required and Deprecated attributes + * DEPRECATED: Don't use. * - * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID + * Returns HTML_VALID */ htmlStatus -htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { - const char** p ; - - if ( !elt || ! 
attr ) - return HTML_INVALID ; - - if ( elt->attrs_req ) - for ( p = elt->attrs_req; *p; ++p) - if ( !xmlStrcmp((const xmlChar*)*p, attr) ) - return HTML_REQUIRED ; - - if ( elt->attrs_opt ) - for ( p = elt->attrs_opt; *p; ++p) - if ( !xmlStrcmp((const xmlChar*)*p, attr) ) - return HTML_VALID ; - - if ( legacy && elt->attrs_depr ) - for ( p = elt->attrs_depr; *p; ++p) - if ( !xmlStrcmp((const xmlChar*)*p, attr) ) - return HTML_DEPRECATED ; - - return HTML_INVALID ; +htmlAttrAllowed(const htmlElemDesc* elt ATTRIBUTE_UNUSED, + const xmlChar* attr ATTRIBUTE_UNUSED, + int legacy ATTRIBUTE_UNUSED) { + return(HTML_VALID); } + /** * htmlNodeStatus: * @node: an htmlNodePtr in a tree * @legacy: whether to allow deprecated elements (YES is faster here * for Element nodes) * - * Checks whether the tree node is valid. Experimental (the author - * only uses the HTML enhancements in a SAX parser) + * DEPRECATED: Don't use. * - * Return: for Element nodes, a return from htmlElementAllowedHere (if - * legacy allowed) or htmlElementStatusHere (otherwise). - * for Attribute nodes, a return from htmlAttrAllowed - * for other nodes, HTML_NA (no checks performed) + * Returns HTML_VALID */ htmlStatus -htmlNodeStatus(const htmlNodePtr node, int legacy) { - if ( ! node ) - return HTML_INVALID ; - - switch ( node->type ) { - case XML_ELEMENT_NODE: - return legacy - ? ( htmlElementAllowedHere ( - htmlTagLookup(node->parent->name) , node->name - ) ? 
HTML_VALID : HTML_INVALID ) - : htmlElementStatusHere( - htmlTagLookup(node->parent->name) , - htmlTagLookup(node->name) ) - ; - case XML_ATTRIBUTE_NODE: - return htmlAttrAllowed( - htmlTagLookup(node->parent->name) , node->name, legacy) ; - default: return HTML_NA ; - } +htmlNodeStatus(htmlNodePtr node ATTRIBUTE_UNUSED, + int legacy ATTRIBUTE_UNUSED) { + return(HTML_VALID); } + /************************************************************************ * * * New set (2.6.0) of simpler and more flexible APIs * @@ -6722,10 +5627,9 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) if (ctxt == NULL) return; - xmlInitParser(); dict = ctxt->dict; - while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ + while ((input = xmlCtxtPopInput(ctxt)) != NULL) { /* Non consuming */ xmlFreeInputStream(input); } ctxt->inputNr = 0; @@ -6746,16 +5650,22 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) ctxt->nameNr = 0; ctxt->name = NULL; + ctxt->nsNr = 0; + DICT_FREE(ctxt->version); ctxt->version = NULL; DICT_FREE(ctxt->encoding); ctxt->encoding = NULL; - DICT_FREE(ctxt->directory); - ctxt->directory = NULL; DICT_FREE(ctxt->extSubURI); ctxt->extSubURI = NULL; DICT_FREE(ctxt->extSubSystem); ctxt->extSubSystem = NULL; + + if (ctxt->directory != NULL) { + xmlFree(ctxt->directory); + ctxt->directory = NULL; + } + if (ctxt->myDoc != NULL) xmlFreeDoc(ctxt->myDoc); ctxt->myDoc = NULL; @@ -6763,24 +5673,23 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) ctxt->standalone = -1; ctxt->hasExternalSubset = 0; ctxt->hasPErefs = 0; - ctxt->html = 1; - ctxt->external = 0; + ctxt->html = INSERT_INITIAL; ctxt->instate = XML_PARSER_START; - ctxt->token = 0; ctxt->wellFormed = 1; ctxt->nsWellFormed = 1; ctxt->disableSAX = 0; ctxt->valid = 1; ctxt->vctxt.userData = ctxt; + ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT; ctxt->vctxt.error = xmlParserValidityError; ctxt->vctxt.warning = xmlParserValidityWarning; ctxt->record_info = 0; ctxt->checkIndex = 0; + ctxt->endCheckState = 0; ctxt->inSubset = 0; ctxt->errNo = XML_ERR_OK; 
ctxt->depth = 0; - ctxt->charset = XML_CHAR_ENCODING_NONE; ctxt->catalogs = NULL; xmlInitNodeInfoSeq(&ctxt->node_seq); @@ -6792,6 +5701,168 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) xmlHashFree(ctxt->attsSpecial, NULL); ctxt->attsSpecial = NULL; } + + ctxt->nbErrors = 0; + ctxt->nbWarnings = 0; + if (ctxt->lastError.code != XML_ERR_OK) + xmlResetError(&ctxt->lastError); +} + +static int +htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask) +{ + int allMask; + + if (ctxt == NULL) + return(-1); + + allMask = HTML_PARSE_RECOVER | + HTML_PARSE_HTML5 | + HTML_PARSE_NODEFDTD | + HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING | + HTML_PARSE_PEDANTIC | + HTML_PARSE_NOBLANKS | + HTML_PARSE_NONET | + HTML_PARSE_NOIMPLIED | + HTML_PARSE_COMPACT | + HTML_PARSE_HUGE | + HTML_PARSE_IGNORE_ENC | + HTML_PARSE_BIG_LINES; + + ctxt->options = (ctxt->options & keepMask) | (options & allMask); + + /* + * For some options, struct members are historically the source + * of truth. See xmlCtxtSetOptionsInternal. + */ + ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1; + + /* + * Changing SAX callbacks is a bad idea. This should be fixed. + */ + if (options & HTML_PARSE_NOBLANKS) { + ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; + } + if (options & HTML_PARSE_HUGE) { + if (ctxt->dict != NULL) + xmlDictSetLimit(ctxt->dict, 0); + } + + /* + * It would be useful to allow this feature. + */ + ctxt->dictNames = 0; + + ctxt->linenumbers = 1; + + /* + * Allow XML_PARSE_NOENT which many users set on the HTML parser. + */ + return(options & ~allMask & ~XML_PARSE_NOENT); +} + +/** + * htmlCtxtSetOptions: + * @ctxt: an HTML parser context + * @options: a bitmask of xmlParserOption values + * + * Applies the options to the parser context. Unset options are + * cleared. + * + * Available since 2.14.0. With older versions, you can use + * htmlCtxtUseOptions. + * + * HTML_PARSE_RECOVER + * + * No effect as of 2.14.0. 
+ * + * HTML_PARSE_HTML5 + * + * Make the tokenizer emit a SAX callback for each token. This results + * in unbalanced invocations of startElement and endElement. + * + * For now, this is only usable with custom SAX callbacks. + * + * HTML_PARSE_NODEFDTD + * + * Do not default to a doctype if none was found. + * + * HTML_PARSE_NOERROR + * + * Disable error and warning reports to the error handlers. + * Errors are still accessible with xmlCtxtGetLastError. + * + * HTML_PARSE_NOWARNING + * + * Disable warning reports. + * + * HTML_PARSE_PEDANTIC + * + * No effect. + * + * HTML_PARSE_NOBLANKS + * + * Remove some text nodes containing only whitespace from the + * result document. Which nodes are removed depends on a conservative + * heuristic. The reindenting feature of the serialization code relies + * on this option to be set when parsing. Use of this option is + * DISCOURAGED. + * + * HTML_PARSE_NONET + * + * No effect. + * + * HTML_PARSE_NOIMPLIED + * + * Do not add implied html, head or body elements. + * + * HTML_PARSE_COMPACT + * + * Store small strings directly in the node struct to save + * memory. + * + * HTML_PARSE_HUGE + * + * Relax some internal limits. + * + * Available since 2.14.0. Using XML_PARSE_HUGE works with older + * versions. + * + * Maximum size of text nodes, tags, comments, CDATA sections + * + * normal: 10M + * huge: 1B + * + * Maximum size of names, system literals, pubid literals + * + * normal: 50K + * huge: 10M + * + * Maximum nesting depth of elements + * + * normal: 256 + * huge: 2048 + * + * HTML_PARSE_IGNORE_ENC + * + * Ignore the encoding in the HTML declaration. This option is + * mostly unneeded these days. The only effect is to enforce + * UTF-8 decoding of ASCII-like data. + * + * HTML_PARSE_BIG_LINES + * + * Enable reporting of line numbers larger than 65535. + * + * Available since 2.14.0. + * + * Returns 0 in case of success, the set of unknown or unimplemented options + * in case of error.
+ */ +int +htmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options) +{ + return(htmlCtxtSetOptionsInternal(ctxt, options, 0)); } /** @@ -6799,7 +5870,19 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) * @ctxt: an HTML parser context * @options: a combination of htmlParserOption(s) * - * Applies the options to the parser context + * DEPRECATED: Use htmlCtxtSetOptions. + * + * Applies the options to the parser context. The following options + * are never cleared and can only be enabled: + * + * HTML_PARSE_NODEFDTD + * HTML_PARSE_NOERROR + * HTML_PARSE_NOWARNING + * HTML_PARSE_NOIMPLIED + * HTML_PARSE_COMPACT + * HTML_PARSE_HUGE + * HTML_PARSE_IGNORE_ENC + * HTML_PARSE_BIG_LINES * * Returns 0 in case of success, the set of unknown or unimplemented options * in case of error. @@ -6807,320 +5890,305 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) int htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) { - if (ctxt == NULL) - return(-1); + int keepMask; - if (options & HTML_PARSE_NOWARNING) { - ctxt->sax->warning = NULL; - ctxt->vctxt.warning = NULL; - options -= XML_PARSE_NOWARNING; - ctxt->options |= XML_PARSE_NOWARNING; - } - if (options & HTML_PARSE_NOERROR) { - ctxt->sax->error = NULL; - ctxt->vctxt.error = NULL; - ctxt->sax->fatalError = NULL; - options -= XML_PARSE_NOERROR; - ctxt->options |= XML_PARSE_NOERROR; - } - if (options & HTML_PARSE_PEDANTIC) { - ctxt->pedantic = 1; - options -= XML_PARSE_PEDANTIC; - ctxt->options |= XML_PARSE_PEDANTIC; - } else - ctxt->pedantic = 0; - if (options & XML_PARSE_NOBLANKS) { - ctxt->keepBlanks = 0; - ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; - options -= XML_PARSE_NOBLANKS; - ctxt->options |= XML_PARSE_NOBLANKS; - } else - ctxt->keepBlanks = 1; - if (options & HTML_PARSE_RECOVER) { - ctxt->recovery = 1; - options -= HTML_PARSE_RECOVER; - } else - ctxt->recovery = 0; - if (options & HTML_PARSE_COMPACT) { - ctxt->options |= HTML_PARSE_COMPACT; - options -= HTML_PARSE_COMPACT; - } - if (options & XML_PARSE_HUGE) { - 
ctxt->options |= XML_PARSE_HUGE; - options -= XML_PARSE_HUGE; - } - if (options & HTML_PARSE_NODEFDTD) { - ctxt->options |= HTML_PARSE_NODEFDTD; - options -= HTML_PARSE_NODEFDTD; - } - if (options & HTML_PARSE_IGNORE_ENC) { - ctxt->options |= HTML_PARSE_IGNORE_ENC; - options -= HTML_PARSE_IGNORE_ENC; - } - if (options & HTML_PARSE_NOIMPLIED) { - ctxt->options |= HTML_PARSE_NOIMPLIED; - options -= HTML_PARSE_NOIMPLIED; - } - ctxt->dictNames = 0; - return (options); + /* + * For historic reasons, some options can only be enabled. + */ + keepMask = HTML_PARSE_NODEFDTD | + HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING | + HTML_PARSE_NOIMPLIED | + HTML_PARSE_COMPACT | + HTML_PARSE_HUGE | + HTML_PARSE_IGNORE_ENC | + HTML_PARSE_BIG_LINES; + + return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask)); } /** - * htmlDoRead: + * htmlCtxtParseDocument: * @ctxt: an HTML parser context - * @URL: the base URL to use for the document - * @encoding: the document encoding, or NULL - * @options: a combination of htmlParserOption(s) - * @reuse: keep the context for reuse + * @input: parser input + * + * Parse an HTML document and return the resulting document tree. * - * Common front-end for the htmlRead functions + * Available since 2.13.0. 
* * Returns the resulting document tree or NULL */ -static htmlDocPtr -htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, - int options, int reuse) +htmlDocPtr +htmlCtxtParseDocument(htmlParserCtxtPtr ctxt, xmlParserInputPtr input) { htmlDocPtr ret; - htmlCtxtUseOptions(ctxt, options); - ctxt->html = 1; - if (encoding != NULL) { - xmlCharEncodingHandlerPtr hdlr; - - hdlr = xmlFindCharEncodingHandler(encoding); - if (hdlr != NULL) { - xmlSwitchToEncoding(ctxt, hdlr); - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = xmlStrdup((xmlChar *)encoding); - } + if ((ctxt == NULL) || (input == NULL)) { + xmlFatalErr(ctxt, XML_ERR_ARGUMENT, NULL); + xmlFreeInputStream(input); + return(NULL); } - if ((URL != NULL) && (ctxt->input != NULL) && - (ctxt->input->filename == NULL)) - ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); - htmlParseDocument(ctxt); - ret = ctxt->myDoc; - ctxt->myDoc = NULL; - if (!reuse) { - if ((ctxt->dictNames) && - (ret != NULL) && - (ret->dict == ctxt->dict)) - ctxt->dict = NULL; - xmlFreeParserCtxt(ctxt); + + /* assert(ctxt->inputNr == 0); */ + while (ctxt->inputNr > 0) + xmlFreeInputStream(xmlCtxtPopInput(ctxt)); + + if (xmlCtxtPushInput(ctxt, input) < 0) { + xmlFreeInputStream(input); + return(NULL); } - return (ret); + + ctxt->html = INSERT_INITIAL; + htmlParseDocument(ctxt); + + ret = xmlCtxtGetDocument(ctxt); + + /* assert(ctxt->inputNr == 1); */ + while (ctxt->inputNr > 0) + xmlFreeInputStream(xmlCtxtPopInput(ctxt)); + + return(ret); } /** * htmlReadDoc: - * @cur: a pointer to a zero terminated string - * @URL: the base URL to use for the document - * @encoding: the document encoding, or NULL - * @options: a combination of htmlParserOption(s) + * @str: a pointer to a zero terminated string + * @url: only used for error reporting (optional) + * @encoding: the document encoding (optional) + * @options: a combination of htmlParserOptions * - * parse
an XML in-memory document and build a tree. + * Convenience function to parse an HTML document from a zero-terminated + * string. * - * Returns the resulting document tree + * See htmlCtxtReadDoc for details. + * + * Returns the resulting document tree. */ htmlDocPtr -htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) +htmlReadDoc(const xmlChar *str, const char *url, const char *encoding, + int options) { htmlParserCtxtPtr ctxt; + xmlParserInputPtr input; + htmlDocPtr doc = NULL; - if (cur == NULL) - return (NULL); - - xmlInitParser(); - ctxt = htmlCreateDocParserCtxt(cur, NULL); + ctxt = htmlNewParserCtxt(); if (ctxt == NULL) - return (NULL); - return (htmlDoRead(ctxt, URL, encoding, options, 0)); + return(NULL); + + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str, encoding, + XML_INPUT_BUF_STATIC); + + if (input != NULL) + doc = htmlCtxtParseDocument(ctxt, input); + + htmlFreeParserCtxt(ctxt); + return(doc); } /** * htmlReadFile: * @filename: a file or URL - * @encoding: the document encoding, or NULL - * @options: a combination of htmlParserOption(s) + * @encoding: the document encoding (optional) + * @options: a combination of htmlParserOptions * - * parse an XML file from the filesystem or the network. + * Convenience function to parse an HTML file from the filesystem, + * the network or a global user-defined resource loader. * - * Returns the resulting document tree + * See htmlCtxtReadFile for details. + * + * Returns the resulting document tree. 
*/ htmlDocPtr htmlReadFile(const char *filename, const char *encoding, int options) { htmlParserCtxtPtr ctxt; + xmlParserInputPtr input; + htmlDocPtr doc = NULL; - xmlInitParser(); - ctxt = htmlCreateFileParserCtxt(filename, encoding); + ctxt = htmlNewParserCtxt(); if (ctxt == NULL) - return (NULL); - return (htmlDoRead(ctxt, NULL, NULL, options, 0)); + return(NULL); + + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0); + + if (input != NULL) + doc = htmlCtxtParseDocument(ctxt, input); + + htmlFreeParserCtxt(ctxt); + return(doc); } /** * htmlReadMemory: * @buffer: a pointer to a char array * @size: the size of the array - * @URL: the base URL to use for the document + * @url: only used for error reporting (optional) * @encoding: the document encoding, or NULL * @options: a combination of htmlParserOption(s) * - * parse an XML in-memory document and build a tree. + * Convenience function to parse an HTML document from memory. + * The input buffer must not contain any terminating null bytes. + * + * See htmlCtxtReadMemory for details. 
* * Returns the resulting document tree */ htmlDocPtr -htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) +htmlReadMemory(const char *buffer, int size, const char *url, + const char *encoding, int options) { htmlParserCtxtPtr ctxt; + xmlParserInputPtr input; + htmlDocPtr doc = NULL; - xmlInitParser(); - ctxt = xmlCreateMemoryParserCtxt(buffer, size); + if (size < 0) + return(NULL); + + ctxt = htmlNewParserCtxt(); if (ctxt == NULL) - return (NULL); - htmlDefaultSAXHandlerInit(); - if (ctxt->sax != NULL) - memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); - return (htmlDoRead(ctxt, URL, encoding, options, 0)); + return(NULL); + + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding, + XML_INPUT_BUF_STATIC); + + if (input != NULL) + doc = htmlCtxtParseDocument(ctxt, input); + + htmlFreeParserCtxt(ctxt); + return(doc); } /** * htmlReadFd: * @fd: an open file descriptor - * @URL: the base URL to use for the document + * @url: only used for error reporting (optional) * @encoding: the document encoding, or NULL - * @options: a combination of htmlParserOption(s) + * @options: a combination of htmlParserOptions + * + * Convenience function to parse an HTML document from a + * file descriptor. * - * parse an HTML from a file descriptor and build a tree. * NOTE that the file descriptor will not be closed when the - * reader is closed or reset. + * context is freed or reset. + * + * See htmlCtxtReadFd for details. 
* * Returns the resulting document tree */ htmlDocPtr -htmlReadFd(int fd, const char *URL, const char *encoding, int options) +htmlReadFd(int fd, const char *url, const char *encoding, int options) { htmlParserCtxtPtr ctxt; - xmlParserInputBufferPtr input; - htmlParserInputPtr stream; - - if (fd < 0) - return (NULL); + xmlParserInputPtr input; + htmlDocPtr doc = NULL; - xmlInitParser(); - input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); - if (input == NULL) - return (NULL); - input->closecallback = NULL; ctxt = htmlNewParserCtxt(); - if (ctxt == NULL) { - xmlFreeParserInputBuffer(input); - return (NULL); - } - stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); - if (stream == NULL) { - xmlFreeParserInputBuffer(input); - htmlFreeParserCtxt(ctxt); - return (NULL); - } - inputPush(ctxt, stream); - return (htmlDoRead(ctxt, URL, encoding, options, 0)); + if (ctxt == NULL) + return(NULL); + + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromFd(ctxt, url, fd, encoding, 0); + + if (input != NULL) + doc = htmlCtxtParseDocument(ctxt, input); + + htmlFreeParserCtxt(ctxt); + return(doc); } /** * htmlReadIO: * @ioread: an I/O read function - * @ioclose: an I/O close function + * @ioclose: an I/O close function (optional) * @ioctx: an I/O handler - * @URL: the base URL to use for the document - * @encoding: the document encoding, or NULL + * @url: only used for error reporting (optional) + * @encoding: the document encoding (optional) * @options: a combination of htmlParserOption(s) * - * parse an HTML document from I/O functions and source and build a tree. + * Convenience function to parse an HTML document from I/O functions + * and context. + * + * See htmlCtxtReadIO for details. 
* * Returns the resulting document tree */ htmlDocPtr htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, - void *ioctx, const char *URL, const char *encoding, int options) + void *ioctx, const char *url, const char *encoding, int options) { htmlParserCtxtPtr ctxt; - xmlParserInputBufferPtr input; - xmlParserInputPtr stream; - - if (ioread == NULL) - return (NULL); - xmlInitParser(); + xmlParserInputPtr input; + htmlDocPtr doc = NULL; - input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, - XML_CHAR_ENCODING_NONE); - if (input == NULL) { - if (ioclose != NULL) - ioclose(ioctx); - return (NULL); - } ctxt = htmlNewParserCtxt(); - if (ctxt == NULL) { - xmlFreeParserInputBuffer(input); - return (NULL); - } - stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); - if (stream == NULL) { - xmlFreeParserInputBuffer(input); - xmlFreeParserCtxt(ctxt); + if (ctxt == NULL) return (NULL); - } - inputPush(ctxt, stream); - return (htmlDoRead(ctxt, URL, encoding, options, 0)); + + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromIO(ctxt, url, ioread, ioclose, ioctx, + encoding, 0); + + if (input != NULL) + doc = htmlCtxtParseDocument(ctxt, input); + + htmlFreeParserCtxt(ctxt); + return(doc); } /** * htmlCtxtReadDoc: * @ctxt: an HTML parser context - * @cur: a pointer to a zero terminated string - * @URL: the base URL to use for the document - * @encoding: the document encoding, or NULL - * @options: a combination of htmlParserOption(s) + * @str: a pointer to a zero terminated string + * @URL: only used for error reporting (optional) + * @encoding: the document encoding (optional) + * @options: a combination of htmlParserOptions + * + * Parse an HTML in-memory document and build a tree. * - * parse an XML in-memory document and build a tree. - * This reuses the existing @ctxt parser context + * See htmlCtxtUseOptions for details. 
* * Returns the resulting document tree */ htmlDocPtr -htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, - const char *URL, const char *encoding, int options) +htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str, + const char *URL, const char *encoding, int options) { - xmlParserInputPtr stream; + xmlParserInputPtr input; - if (cur == NULL) - return (NULL); if (ctxt == NULL) return (NULL); - xmlInitParser(); htmlCtxtReset(ctxt); + htmlCtxtUseOptions(ctxt, options); - stream = xmlNewStringInputStream(ctxt, cur); - if (stream == NULL) { - return (NULL); - } - inputPush(ctxt, stream); - return (htmlDoRead(ctxt, URL, encoding, options, 1)); + input = xmlCtxtNewInputFromString(ctxt, URL, (const char *) str, + encoding, 0); + if (input == NULL) + return(NULL); + + return(htmlCtxtParseDocument(ctxt, input)); } /** * htmlCtxtReadFile: * @ctxt: an HTML parser context * @filename: a file or URL - * @encoding: the document encoding, or NULL - * @options: a combination of htmlParserOption(s) + * @encoding: the document encoding (optional) + * @options: a combination of htmlParserOptions * - * parse an XML file from the filesystem or the network. - * This reuses the existing @ctxt parser context + * Parse an HTML file from the filesystem, the network or a + * user-defined resource loader. + * + * See htmlCtxtUseOptions for details. 
* * Returns the resulting document tree */ @@ -7128,22 +6196,19 @@ htmlDocPtr htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, const char *encoding, int options) { - xmlParserInputPtr stream; + xmlParserInputPtr input; - if (filename == NULL) - return (NULL); if (ctxt == NULL) return (NULL); - xmlInitParser(); htmlCtxtReset(ctxt); + htmlCtxtUseOptions(ctxt, options); - stream = xmlLoadExternalEntity(filename, NULL, ctxt); - if (stream == NULL) { - return (NULL); - } - inputPush(ctxt, stream); - return (htmlDoRead(ctxt, NULL, encoding, options, 1)); + input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0); + if (input == NULL) + return(NULL); + + return(htmlCtxtParseDocument(ctxt, input)); } /** @@ -7151,12 +6216,14 @@ htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, * @ctxt: an HTML parser context * @buffer: a pointer to a char array * @size: the size of the array - * @URL: the base URL to use for the document - * @encoding: the document encoding, or NULL - * @options: a combination of htmlParserOption(s) + * @URL: only used for error reporting (optional) + * @encoding: the document encoding (optional) + * @options: a combination of htmlParserOptions * - * parse an XML in-memory document and build a tree. - * This reuses the existing @ctxt parser context + * Parse an HTML in-memory document and build a tree. The input buffer must + * not contain any terminating null bytes. + * + * See htmlCtxtUseOptions for details. 
* * Returns the resulting document tree */ @@ -7164,42 +6231,36 @@ htmlDocPtr htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, const char *URL, const char *encoding, int options) { - xmlParserInputBufferPtr input; - xmlParserInputPtr stream; + xmlParserInputPtr input; - if (ctxt == NULL) - return (NULL); - if (buffer == NULL) + if ((ctxt == NULL) || (size < 0)) return (NULL); - xmlInitParser(); htmlCtxtReset(ctxt); + htmlCtxtUseOptions(ctxt, options); - input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); - if (input == NULL) { - return(NULL); - } - - stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); - if (stream == NULL) { - xmlFreeParserInputBuffer(input); - return(NULL); - } + input = xmlCtxtNewInputFromMemory(ctxt, URL, buffer, size, encoding, + XML_INPUT_BUF_STATIC); + if (input == NULL) + return(NULL); - inputPush(ctxt, stream); - return (htmlDoRead(ctxt, URL, encoding, options, 1)); + return(htmlCtxtParseDocument(ctxt, input)); } /** * htmlCtxtReadFd: * @ctxt: an HTML parser context * @fd: an open file descriptor - * @URL: the base URL to use for the document - * @encoding: the document encoding, or NULL - * @options: a combination of htmlParserOption(s) + * @URL: only used for error reporting (optional) + * @encoding: the document encoding (optional) + * @options: a combination of htmlParserOptions * - * parse an XML from a file descriptor and build a tree. - * This reuses the existing @ctxt parser context + * Parse an HTML document from a file descriptor and build a tree. + * + * See htmlCtxtUseOptions for details. + * + * NOTE that the file descriptor will not be closed when the + * context is freed or reset. 
* * Returns the resulting document tree */ @@ -7207,28 +6268,19 @@ htmlDocPtr htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, const char *URL, const char *encoding, int options) { - xmlParserInputBufferPtr input; - xmlParserInputPtr stream; + xmlParserInputPtr input; - if (fd < 0) - return (NULL); if (ctxt == NULL) - return (NULL); - xmlInitParser(); + return(NULL); htmlCtxtReset(ctxt); + htmlCtxtUseOptions(ctxt, options); - - input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); + input = xmlCtxtNewInputFromFd(ctxt, URL, fd, encoding, 0); if (input == NULL) - return (NULL); - stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); - if (stream == NULL) { - xmlFreeParserInputBuffer(input); - return (NULL); - } - inputPush(ctxt, stream); - return (htmlDoRead(ctxt, URL, encoding, options, 1)); + return(NULL); + + return(htmlCtxtParseDocument(ctxt, input)); } /** @@ -7241,8 +6293,9 @@ htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, * @encoding: the document encoding, or NULL * @options: a combination of htmlParserOption(s) * - * parse an HTML document from I/O functions and source and build a tree. - * This reuses the existing @ctxt parser context + * Parse an HTML document from I/O functions and source and build a tree. + * + * See htmlCtxtUseOptions for details. 
* * Returns the resulting document tree */ @@ -7252,33 +6305,20 @@ htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, const char *URL, const char *encoding, int options) { - xmlParserInputBufferPtr input; - xmlParserInputPtr stream; + xmlParserInputPtr input; - if (ioread == NULL) - return (NULL); if (ctxt == NULL) return (NULL); - xmlInitParser(); htmlCtxtReset(ctxt); + htmlCtxtUseOptions(ctxt, options); - input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, - XML_CHAR_ENCODING_NONE); - if (input == NULL) { - if (ioclose != NULL) - ioclose(ioctx); - return (NULL); - } - stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); - if (stream == NULL) { - xmlFreeParserInputBuffer(input); - return (NULL); - } - inputPush(ctxt, stream); - return (htmlDoRead(ctxt, URL, encoding, options, 1)); + input = xmlCtxtNewInputFromIO(ctxt, URL, ioread, ioclose, ioctx, + encoding, 0); + if (input == NULL) + return(NULL); + + return(htmlCtxtParseDocument(ctxt, input)); } -#define bottom_HTMLparser -#include "elfgcchack.h" #endif /* LIBXML_HTML_ENABLED */ diff --git a/contrib/libs/libxml/HTMLtree.c b/contrib/libs/libxml/HTMLtree.c index 7a2b85583892..3a013a33c561 100644 --- a/contrib/libs/libxml/HTMLtree.c +++ b/contrib/libs/libxml/HTMLtree.c @@ -12,25 +12,21 @@ #ifdef LIBXML_HTML_ENABLED #include /* for memset() only ! */ - -#ifdef HAVE_CTYPE_H #include -#endif -#ifdef HAVE_STDLIB_H #include -#endif #include #include #include #include -#include #include #include -#include #include -#include "buf.h" +#include "private/buf.h" +#include "private/error.h" +#include "private/io.h" +#include "private/save.h" /************************************************************************ * * @@ -304,7 +300,7 @@ htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { * output as