/*
  parser.c - HTML Parser

  (c) 1998-2001 (W3C) MIT, INRIA, Keio University
  See tidy.c for the copyright notice.

  CVS Info :

    $Author: creitzel $
    $Date: 2001/10/17 15:20:24 $
    $Revision: 1.32 $
*/

#include "platform.h"   /* platform independent stuff */
#include "html.h"       /* to pull in definition of nodes */

int SeenBodyEndTag;  /* could be moved into lexer structure */

/*
  Recursively validate the link structure of the subtree rooted at node.

  Returns yes only when all of the following hold, no otherwise:
    - node's prev/next neighbours point back at node;
    - if node has a parent, the parent's content pointer is node when
      node has no prev, and the parent's last pointer is node when
      node has no next;
    - node can be reached by walking the parent's child list;
    - every child of node passes the same checks.
*/
Bool CheckNodeIntegrity(Node *node)
{
    Node *sibling;
    Node *kid;

    /* neighbours must link back to us */
    if (node->prev != null && node->prev->next != node)
        return no;

    if (node->next != null && node->next->prev != node)
        return no;

    if (node->parent != null)
    {
        /* a first/last child must be what the parent thinks it is */
        if (node->prev == null && node->parent->content != node)
            return no;

        if (node->next == null && node->parent->last != node)
            return no;

        /* walk the parent's children until we meet ourselves */
        sibling = node->parent->content;

        while (sibling != null && sibling != node)
            sibling = sibling->next;

        if (sibling == null)
            return no;
    }

    /* finally descend into our own children */
    for (kid = node->content; kid != null; kid = kid->next)
    {
        if (!CheckNodeIntegrity(kid))
            return no;
    }

    return yes;
}

/*
 used to determine how attributes
 without values should be printed
 this was introduced to deal with
 user defined tags e.g.
Cold Fusion */ Bool IsNewNode(Node *node) { if (node && node->tag) { return (node->tag->model & CM_NEW); } return yes; } void CoerceNode(Lexer *lexer, Node *node, Dict *tag) { Node *tmp = InferredTag(lexer, tag->name); ReportWarning(lexer, node, tmp, OBSOLETE_ELEMENT); MemFree(tmp->element); MemFree(tmp); MemFree(node->element); node->was = node->tag; node->tag = tag; node->type = StartTag; node->implicit = yes; node->element = wstrdup(tag->name); } /* extract a node and its children from a markup tree */ void RemoveNode(Node *node) { if (node->prev) node->prev->next = node->next; if (node->next) node->next->prev = node->prev; if (node->parent) { if (node->parent->content == node) node->parent->content = node->next; if (node->parent->last == node) node->parent->last = node->prev; } node->parent = node->prev = node->next = null; } /* remove node from markup tree and discard it */ Node *DiscardElement(Node *element) { Node *next = null; if (element) { next = element->next; RemoveNode(element); FreeNode(element); } return next; } /* insert node into markup tree */ void InsertNodeAtStart(Node *element, Node *node) { node->parent = element; if (element->content == null) element->last = node; #if 1 else element->content->prev = node; // AQ added 13 Apr 2000 #endif node->next = element->content; node->prev = null; element->content = node; } /* insert node into markup tree */ void InsertNodeAtEnd(Node *element, Node *node) { node->parent = element; node->prev = element->last; if (element->last != null) element->last->next = node; else element->content = node; element->last = node; } /* insert node into markup tree in pace of element which is moved to become the child of the node */ static void InsertNodeAsParent(Node *element, Node *node) { node->content = element; node->last = element; node->parent = element->parent; element->parent = node; if (node->parent->content == element) node->parent->content = node; if (node->parent->last == element) node->parent->last = node; 
node->prev = element->prev; element->prev = null; if (node->prev) node->prev->next = node; node->next = element->next; element->next = null; if (node->next) node->next->prev = node; } /* insert node into markup tree before element */ void InsertNodeBeforeElement(Node *element, Node *node) { Node *parent; parent = element->parent; node->parent = parent; node->next = element; node->prev = element->prev; element->prev = node; if (node->prev) node->prev->next = node; if (parent->content == element) parent->content = node; } /* insert node into markup tree after element */ void InsertNodeAfterElement(Node *element, Node *node) { Node *parent; parent = element->parent; node->parent = parent; // AQ - 13Jan2000 fix for parent == null if (parent != null && parent->last == element) parent->last = node; else { node->next = element->next; // AQ - 13Jan2000 fix for node->next == null if (node->next != null) node->next->prev = node; } element->next = node; node->prev = element; } static Bool CanPrune(Node *element) { if (element->type == TextNode) return yes; if (element->content) return no; if (element->tag == tag_a && element->attributes != null) return no; if (element->tag == tag_p && !DropEmptyParas) return no; if (element->tag == null) return no; if (element->tag->model & CM_ROW) return no; if (element->tag == tag_applet) return no; if (element->tag == tag_object) return no; /* #433359 - fix by Randy Waki 12 Mar 01 */ if (element->tag == tag_iframe) return no; if ( element->attributes != null && (GetAttrByName(element, "id") || GetAttrByName(element, "name")) ) return no; return yes; } static void TrimEmptyElement(Lexer *lexer, Node *element) { if (CanPrune(element)) { if (element->type != TextNode) ReportWarning(lexer, element, null, TRIM_EMPTY_ELEMENT); DiscardElement(element); } else if (element->tag == tag_p && element->content == null) { /* replace
byhello world to
hello world Trims initial space, by moving it before the start tag, or if this element is the first in parent's content, then by discarding the space */ static void TrimInitialSpace(Lexer *lexer, Node *element, Node *text) { Node *prev, *node; if (text->type == TextNode && lexer->lexbuf[text->start] == ' ' && text->start < text->end) /* #427677 - fix by Gary Peskin 31 Oct 00 */ { if ((element->tag->model & CM_INLINE) && !(element->tag->model & CM_FIELD) && element->parent->content != element) { prev = element->prev; if (prev && prev->type == TextNode) { if (lexer->lexbuf[prev->end - 1] != ' ') lexer->lexbuf[(prev->end)++] = ' '; ++(element->start); } else /* create new node */ { node = NewNode(); node->start = (element->start)++; node->end = element->start; lexer->lexbuf[node->start] = ' '; node->prev = prev; if (prev) prev->next = node; node->next = element; element->prev = node; node->parent = element->parent; } } /* discard the space in current node */ ++(text->start); } } /* Move initial and trailing space out. 
This routine maps: hello world to hello world and hello world to hello world */ static void TrimSpaces(Lexer *lexer, Node *element) { Node *text = element->content; if (text && text->type == TextNode && element->tag != tag_pre) TrimInitialSpace(lexer, element, text); text = element->last; if (text && text->type == TextNode) TrimTrailingSpace(lexer, element, text); } static Bool DescendantOf(Node *element, Dict *tag) { Node *parent; for (parent = element->parent; parent != null; parent = parent->parent) { if (parent->tag == tag) return yes; } return no; } static Bool InsertMisc(Node *element, Node *node) { if (node->type == CommentTag || node->type == ProcInsTag || node->type == CDATATag || node->type == SectionTag || node->type == AspTag || node->type == JsteTag || node->type == PhpTag || node->type == XmlDecl) { InsertNodeAtEnd(element, node); return yes; } return no; } static void ParseTag(Lexer *lexer, Node *node, uint mode) { /* Fix by GLP 2000-12-21. Need to reset insertspace if this is both a non-inline and empty tag (base, link, meta, isindex, hr, area). 
*/ if (node->tag->model & CM_EMPTY) { lexer->waswhite = no; if (node->tag->parser == null) return; } else if (!(node->tag->model & CM_INLINE)) lexer->insertspace = no; if (node->tag->parser == null || node->type == StartEndTag) return; (*node->tag->parser)(lexer, node, mode); } /* the doctype has been found after other tags, and needs moving to before the html element */ static void InsertDocType(Lexer *lexer, Node *element, Node *doctype) { ReportWarning(lexer, element, doctype, DOCTYPE_AFTER_TAGS); while (element->tag != tag_html) element = element->parent; InsertNodeBeforeElement(element, doctype); } /* duplicate name attribute as an id and check if id and name match */ void FixId(Lexer *lexer, Node *node) { AttVal *name = GetAttrByName(node, "name"); AttVal *id = GetAttrByName(node, "id"); if (name) { if (id) { if ((name->value != null) && (id->value != null)) if (wstrcmp(id->value, name->value) != 0) ReportAttrError(lexer, node, name, ID_NAME_MISMATCH); } else if (XmlOut) AddAttribute(node, "id", name->value); } } /* move node to the head, where element is used as starting point in hunt for head. 
normally called during parsing */
/*
  node is first detached from wherever it currently sits.

  Start tags and start-end tags: a TAG_NOT_ALLOWED_IN warning is
  issued, element is walked upwards until the html element is reached,
  and node is appended to the first direct child of html whose tag is
  head. If the tag has a parser the node's content is then parsed with
  IgnoreWhitespace.

  Any other token type is reported as DISCARDING_UNEXPECTED and freed.

  NOTE(review): if no head child is found the node is not inserted
  anywhere, yet its parser is still invoked; the upward walk also
  assumes an html ancestor exists. Confirm callers guarantee both.
*/
static void MoveToHead(Lexer *lexer, Node *element, Node *node)
{
    Node *head;

    RemoveNode(node);  /* make sure that node is isolated */

    if (node->type == StartTag || node->type == StartEndTag)
    {
        ReportWarning(lexer, element, node, TAG_NOT_ALLOWED_IN);

        /* climb to the html element */
        while (element->tag != tag_html)
            element = element->parent;

        /* look for head among html's direct children */
        for (head = element->content; head; head = head->next)
        {
            if (head->tag == tag_head)
            {
                InsertNodeAtEnd(head, node);
                break;
            }
        }

        if (node->tag->parser)
            ParseTag(lexer, node, IgnoreWhitespace);
    }
    else
    {
        ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }
}

/*
   element is node created by the lexer
   upon seeing the start tag, or by the
   parser when the start tag is inferred
*/
void ParseBlock(Lexer *lexer, Node *element, uint mode)
{
    Node *node, *parent;
    Bool checkstack;
    uint istackbase;

    checkstack = yes;

    /* empty elements have no content to parse */
    if (element->tag->model & CM_EMPTY)
        return;

    if (element->tag == tag_form && DescendantOf(element, tag_form))
        ReportWarning(lexer, element, null, ILLEGAL_NESTING);

    /*
     InlineDup() asks the lexer to insert inline emphasis tags
     currently pushed on the istack, but take care to avoid
     propagating inline emphasis inside OBJECT or APPLET.
     For these elements a fresh inline stack context is created
     and disposed of upon reaching the end of the element.
     They thus behave like table cells in this respect.
*/ if (element->tag->model & CM_OBJECT) { istackbase = lexer->istackbase; lexer->istackbase = lexer->istacksize; } if (!(element->tag->model & CM_MIXED)) InlineDup(lexer, null); mode = IgnoreWhitespace; while ((node = GetToken(lexer, mode /*MixedContent*/)) != null) { /* end tag for this element */ if (node->type == EndTag && node->tag && (node->tag == element->tag || element->was == node->tag)) { FreeNode(node); if (element->tag->model & CM_OBJECT) { /* pop inline stack */ while (lexer->istacksize > lexer->istackbase) PopInline(lexer, null); lexer->istackbase = istackbase; } element->closed = yes; TrimSpaces(lexer, element); TrimEmptyElement(lexer, element); return; } if (node->tag == tag_html || node->tag == tag_head || node->tag == tag_body) { if (node->type == StartTag || node->type == StartEndTag) ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } if (node->type == EndTag) { if (node->tag == null) { ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } else if (node->tag == tag_br) node->type = StartTag; else if (node->tag == tag_p) { CoerceNode(lexer, node, tag_br); FreeAttrs(node); /* discard align attribute etc. */ InsertNodeAtEnd(element, node); node = InferredTag(lexer, "br"); } else { /* if this is the end tag for an ancestor element then infer end tag for this element */ for (parent = element->parent; parent != null; parent = parent->parent) { if (node->tag == parent->tag) { if (!(element->tag->model & CM_OPT)) ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE); UngetToken(lexer); if (element->tag->model & CM_OBJECT) { /* pop inline stack */ while (lexer->istacksize > lexer->istackbase) PopInline(lexer, null); lexer->istackbase = istackbase; } TrimSpaces(lexer, element); TrimEmptyElement(lexer, element); return; } } /* special case etc. 
for stuff moved in front of table */ if (lexer->exiled && node->tag->model && (node->tag->model & CM_TABLE)) { UngetToken(lexer); TrimSpaces(lexer, element); TrimEmptyElement(lexer, element); return; } } } /* mixed content model permits text */ if (node->type == TextNode) { Bool iswhitenode = no; if (node->type == TextNode && node->end <= node->start + 1 && lexer->lexbuf[node->start] == ' ') iswhitenode = yes; if (EncloseBlockText && !iswhitenode) { UngetToken(lexer); node = InferredTag(lexer, "p"); InsertNodeAtEnd(element, node); ParseTag(lexer, node, MixedContent); continue; } if (checkstack) { checkstack = no; if (!(element->tag->model & CM_MIXED)) { if (InlineDup(lexer, node) > 0) continue; } } InsertNodeAtEnd(element, node); mode = MixedContent; /* HTML4 strict doesn't allow mixed content for elements with %block; as their content model */ /* But only body, map, blockquote, form and noscript have content model %block; */ if (element->tag == tag_body || element->tag == tag_map || element->tag == tag_blockquote || element->tag == tag_form || element->tag == tag_noscript) ConstrainVersion(lexer, ~VERS_HTML40_STRICT); continue; } if (InsertMisc(element, node)) continue; /* allow PARAM elements? */ if (node->tag == tag_param) { if ((element->tag->model & CM_PARAM) && (node->type == StartEndTag || node->type == StartTag)) { InsertNodeAtEnd(element, node); continue; } /* otherwise discard it */ ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* allow AREA elements? */ if (node->tag == tag_area) { if ((element->tag == tag_map) && (node->type == StartTag || node->type == StartEndTag)) { InsertNodeAtEnd(element, node); continue; } /* otherwise discard it */ ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* ignore unknown start/end tags */ if (node->tag == null) { ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* Allow CM_INLINE elements here. 
Allow CM_BLOCK elements here unless lexer->excludeBlocks is yes. LI and DD are special cased. Otherwise infer end tag for this element. */ if (!(node->tag->model & CM_INLINE)) { if (node->type != StartTag && node->type != StartEndTag) { if (node->tag == tag_form) BadForm(lexer); ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* #427671 - Fix by Randy Waki - 10 Aug 00 */ /* If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION start tag, discard the start tag and let the subsequent content get parsed as content of the enclosing LI. This seems to mimic IE and Netscape, and avoids an infinite loop: without this check, ParseBlock (which is parsing the LI's content) and ParseList (which is parsing the LI's parent's content) repeatedly defer to each other to parse the illegal start tag, each time inferring a missing or
mapto
to
*/ if (node->tag == tag_p && node->type == StartTag && ((mode & Preformatted) || element->tag == tag_dt || DescendantOf(element, tag_dt))) { node->tag = tag_br; MemFree(node->element); node->element = wstrdup("br"); TrimSpaces(lexer, element); InsertNodeAtEnd(element, node); continue; } /* ignore unknown and PARAM tags */ if (node->tag == null || node->tag == tag_param) { ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } if (node->tag == tag_br && node->type == EndTag) node->type = StartTag; if (node->type == EndTag) { /* coerce to
*/ if (node->tag == tag_br) node->type = StartTag; else if (node->tag == tag_p) { /* coerce unmatched
*/ if (!DescendantOf(element, tag_p)) { CoerceNode(lexer, node, tag_br); TrimSpaces(lexer, element); InsertNodeAtEnd(element, node); node = InferredTag(lexer, "br"); continue; } } else if (node->tag->model & CM_INLINE && node->tag != tag_a && !(node->tag->model & CM_OBJECT) && element->tag->model & CM_INLINE) { /* allow any inline end tag to end current element */ PopInline(lexer, element); if (element->tag != tag_a) { if (node->tag == tag_a && node->tag != element->tag) { ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE); UngetToken(lexer); } else { ReportWarning(lexer, element, node, NON_MATCHING_ENDTAG); FreeNode(node); } if (!(mode & Preformatted)) TrimSpaces(lexer, element); TrimEmptyElement(lexer, element); return; } /* if parent is then discard unexpected inline end tag */ ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* special case etc. for stuff moved in front of table */ else if (lexer->exiled && node->tag->model && (node->tag->model & CM_TABLE)) { UngetToken(lexer); TrimSpaces(lexer, element); TrimEmptyElement(lexer, element); return; } } /* allow any header tag to end current header */ if (node->tag->model & CM_HEADING && element->tag->model & CM_HEADING) { if (node->tag == element->tag) { ReportWarning(lexer, element, node, NON_MATCHING_ENDTAG); FreeNode(node); } else { ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE); UngetToken(lexer); } if (!(mode & Preformatted)) TrimSpaces(lexer, element); TrimEmptyElement(lexer, element); return; } /* an tag to ends any open element but is mapped to */ /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */ /* if (node->tag == tag_a && !node->implicit && IsPushed(lexer, node)) */ if (node->tag == tag_a && !node->implicit && (element->tag == tag_a || DescendantOf(element, tag_a))) { /* coerce to unless it has some attributes */ /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */ /* other fixes by Dave Raggett */ /* if 
(node->attributes == null) */ if (node->type != EndTag && node->attributes == null) { node->type = EndTag; ReportWarning(lexer, element, node, COERCE_TO_ENDTAG); /* PopInline(lexer, node); */ UngetToken(lexer); continue; } UngetToken(lexer); ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE); /* PopInline(lexer, element); */ if (!(mode & Preformatted)) TrimSpaces(lexer, element); TrimEmptyElement(lexer, element); return; } if (element->tag->model & CM_HEADING) { if (node->tag == tag_center || node->tag == tag_div) { if (node->type != StartTag && node->type != StartEndTag) { ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } ReportWarning(lexer, element, node, TAG_NOT_ALLOWED_IN); /* insert center as parent if heading is empty */ if (element->content == null) { InsertNodeAsParent(element, node); continue; } /* split heading and make center parent of 2nd part */ InsertNodeAfterElement(element, node); if (!(mode & Preformatted)) TrimSpaces(lexer, element); element = CloneNode(lexer, element); InsertNodeAtEnd(node, element); continue; } if (node->tag == tag_hr) { if (node->type != StartTag && node->type != StartEndTag) { ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } ReportWarning(lexer, element, node, TAG_NOT_ALLOWED_IN); /* insert hr before heading if heading is empty */ if (element->content == null) { InsertNodeBeforeElement(element, node); continue; } /* split heading and insert hr before 2nd part */ InsertNodeAfterElement(element, node); if (!(mode & Preformatted)) TrimSpaces(lexer, element); element = CloneNode(lexer, element); InsertNodeAfterElement(node, element); continue; } } if (element->tag == tag_dt) { if (node->tag == tag_hr) { Node *dd; if (node->type != StartTag && node->type != StartEndTag) { ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } ReportWarning(lexer, element, node, TAG_NOT_ALLOWED_IN); dd = InferredTag(lexer, "dd"); /* 
insert hr within dd before dt if dt is empty */ if (element->content == null) { InsertNodeBeforeElement(element, dd); InsertNodeAtEnd(dd, node); continue; } /* split dt and insert hr within dd before 2nd part */ InsertNodeAfterElement(element, dd); InsertNodeAtEnd(dd, node); if (!(mode & Preformatted)) TrimSpaces(lexer, element); element = CloneNode(lexer, element); InsertNodeAfterElement(dd, element); continue; } } /* if this is the end tag for an ancestor element then infer end tag for this element */ if (node->type == EndTag) { for (parent = element->parent; parent != null; parent = parent->parent) { if (node->tag == parent->tag) { if (!(element->tag->model & CM_OPT) && !element->implicit) ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE); PopInline(lexer, element); UngetToken(lexer); if (!(mode & Preformatted)) TrimSpaces(lexer, element); TrimEmptyElement(lexer, element); return; } } } /* block level tags end this element */ if (!(node->tag->model & CM_INLINE)) { if (node->type != StartTag) { ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } if (!(element->tag->model & CM_OPT)) ReportWarning(lexer, element, node, MISSING_ENDTAG_BEFORE); if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK)) { MoveToHead(lexer, element, node); continue; } /* prevent anchors from propagating into block tags except for headings h1 to h6 */ if (element->tag == tag_a) { if (node->tag && !(node->tag->model & CM_HEADING)) PopInline(lexer, element); else if (!(element->content)) { DiscardElement(element); UngetToken(lexer); return; } } UngetToken(lexer); if (!(mode & Preformatted)) TrimSpaces(lexer, element); TrimEmptyElement(lexer, element); return; } /* parse inline element */ if (node->type == StartTag || node->type == StartEndTag) { if (node->implicit) ReportWarning(lexer, element, node, INSERTING_TAG); /* trim white space before
*/ if (node->tag == tag_br) TrimSpaces(lexer, element); InsertNodeAtEnd(element, node); ParseTag(lexer, node, mode); continue; } /* discard unexpected tags */ ReportWarning(lexer, element, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } if (!(element->tag->model & CM_OPT)) ReportWarning(lexer, element, node, MISSING_ENDTAG_FOR); TrimEmptyElement(lexer, element); } void ParseEmpty(Lexer *lexer, Node *element, uint mode) { if (lexer->isvoyager) { Node *node = GetToken(lexer, mode); if (!(node->type == EndTag && node->tag == element->tag)) { ReportWarning(lexer, element, node, ELEMENT_NOT_EMPTY); UngetToken(lexer); } } } void ParseDefList(Lexer *lexer, Node *list, uint mode) { Node *node, *parent; if (list->tag->model & CM_EMPTY) return; lexer->insert = null; /* defer implicit inline start tags */ while ((node = GetToken(lexer, IgnoreWhitespace)) != null) { if (node->tag == list->tag && node->type == EndTag) { FreeNode(node); list->closed = yes; TrimEmptyElement(lexer, list); return; } /* deal with comments etc. 
*/ if (InsertMisc(list, node)) continue; if (node->type == TextNode) { UngetToken(lexer); node = InferredTag(lexer, "dt"); ReportWarning(lexer, list, node, MISSING_STARTTAG); } if (node->tag == null) { ReportWarning(lexer, list, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* if this is the end tag for an ancestor element then infer end tag for this element */ if (node->type == EndTag) { if (node->tag == tag_form) { BadForm(lexer); ReportWarning(lexer, list, node, DISCARDING_UNEXPECTED); continue; } for (parent = list->parent; parent != null; parent = parent->parent) { if (node->tag == parent->tag) { ReportWarning(lexer, list, node, MISSING_ENDTAG_BEFORE); UngetToken(lexer); TrimEmptyElement(lexer, list); return; } } } /* center in a dt or a dl breaks the dl list in two */ if (node->tag == tag_center) { if (list->content) InsertNodeAfterElement(list, node); else /* trim empty dl list */ { InsertNodeBeforeElement(list, node); DiscardElement(list); } /* #426885 - fix by Glenn Carroll 19 Apr 00, and Gary Dechaines 11 Aug 00 */ /* ParseTag can destroy node, if it finds that * thisis followed immediately by . * It's awkward but necessary to determine if this * has happened. */ parent = node->parent; /* and parse contents of center */ lexer->excludeBlocks = no; ParseTag(lexer, node, mode); lexer->excludeBlocks = yes; /* now create a new dl element, * unless node has been blown away because the * center was empty, as above. 
*/ if (parent->last == node) { list = InferredTag(lexer, "dl"); InsertNodeAfterElement(node, list); } continue; } if (!(node->tag == tag_dt || node->tag == tag_dd)) { UngetToken(lexer); if (!(node->tag->model & (CM_BLOCK | CM_INLINE))) { ReportWarning(lexer, list, node, TAG_NOT_ALLOWED_IN); TrimEmptyElement(lexer, list); return; } /* if DD appeared directly in BODY then exclude blocks */ if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks) { TrimEmptyElement(lexer, list); return; } node = InferredTag(lexer, "dd"); ReportWarning(lexer, list, node, MISSING_STARTTAG); } if (node->type == EndTag) { ReportWarning(lexer, list, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* node should be
or | */ InsertNodeAtEnd(row, node); exclude_state = lexer->excludeBlocks; lexer->excludeBlocks = no; ParseTag(lexer, node, IgnoreWhitespace); lexer->excludeBlocks = exclude_state; /* pop inline stack */ while (lexer->istacksize > lexer->istackbase) PopInline(lexer, null); } TrimEmptyElement(lexer, row); } void ParseRowGroup(Lexer *lexer, Node *rowgroup, uint mode) { Node *node, *parent; if (rowgroup->tag->model & CM_EMPTY) return; while ((node = GetToken(lexer, IgnoreWhitespace)) != null) { if (node->tag == rowgroup->tag) { if (node->type == EndTag) { rowgroup->closed = yes; TrimEmptyElement(lexer, rowgroup); FreeNode(node); return; } UngetToken(lexer); return; } /* if |
---|
in
*/ TrimSpaces(lexer, pre); /* coerce bothand
to
*/ CoerceNode(lexer, node, tag_br); FreeAttrs(node); /* discard align attribute etc. */ InsertNodeAtEnd(pre, node); } else { ReportWarning(lexer, pre, node, DISCARDING_UNEXPECTED); FreeNode(node); } continue; } if (node->type == StartTag || node->type == StartEndTag) { /* trim white space before
*/
            if (node->tag == tag_br)
                TrimSpaces(lexer, pre);

            InsertNodeAtEnd(pre, node);
            ParseTag(lexer, node, Preformatted);
            continue;
        }

        /* discard unexpected tags */
        ReportWarning(lexer, pre, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }

    /* input ran out before the end tag was seen (node is null here) */
    ReportWarning(lexer, pre, node, MISSING_ENDTAG_FOR);
    TrimEmptyElement(lexer, pre);
}

/*
  Parse the content of an option group element.

  Tokens are read with IgnoreWhitespace; the mode argument is not
  used. Only option and optgroup start tags (and the miscellaneous
  tokens accepted by InsertMisc) are kept; a nested optgroup is kept
  but reported with CANT_BE_NESTED. Everything else is reported and
  freed. No warning is issued if input ends before the end tag.
*/
void ParseOptGroup(Lexer *lexer, Node *field, uint mode)
{
    Node *node;

    lexer->insert = null;  /* defer implicit inline start tags */

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null)
    {
        /* matching end tag closes the element */
        if (node->tag == field->tag && node->type == EndTag)
        {
            FreeNode(node);
            field->closed = yes;
            TrimSpaces(lexer, field);
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(field, node))
            continue;

        if (node->type == StartTag &&
             (node->tag == tag_option || node->tag == tag_optgroup))
        {
            if (node->tag == tag_optgroup)
                ReportWarning(lexer, field, node, CANT_BE_NESTED);

            InsertNodeAtEnd(field, node);
            ParseTag(lexer, node, MixedContent);
            continue;
        }

        /* discard unexpected tags */
        ReportWarning(lexer, field, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }
}

/*
  Parse the content of a select element.

  Tokens are read with IgnoreWhitespace; the mode argument is not
  used. option, optgroup and script start tags are inserted and
  parsed; other tokens (apart from those taken by InsertMisc) are
  reported and freed. MISSING_ENDTAG_FOR is reported if input ends
  before the matching end tag.
*/
void ParseSelect(Lexer *lexer, Node *field, uint mode)
{
    Node *node;

    lexer->insert = null;  /* defer implicit inline start tags */

    while ((node = GetToken(lexer, IgnoreWhitespace)) != null)
    {
        /* matching end tag closes the element */
        if (node->tag == field->tag && node->type == EndTag)
        {
            FreeNode(node);
            field->closed = yes;
            TrimSpaces(lexer, field);
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(field, node))
            continue;

        if (node->type == StartTag &&
             (node->tag == tag_option ||
               node->tag == tag_optgroup ||
               node->tag == tag_script))
        {
            InsertNodeAtEnd(field, node);
            ParseTag(lexer, node, IgnoreWhitespace);
            continue;
        }

        /* discard unexpected tags */
        ReportWarning(lexer, field, node, DISCARDING_UNEXPECTED);
        FreeNode(node);
    }

    /* input ran out before the end tag was seen (node is null here) */
    ReportWarning(lexer, field, node, MISSING_ENDTAG_FOR);
}

/*
  Parse an element whose content is text only.

  The incoming mode is overwritten: Preformatted for textarea,
  MixedContent for anything else. Text nodes are appended (empty ones
  are freed), inline tags are reported and dropped, and any other tag
  ends the element: it is pushed back and, unless the element's end
  tag is optional (CM_OPT), MISSING_ENDTAG_BEFORE is reported.
*/
void ParseText(Lexer *lexer, Node *field, uint mode)
{
    Node *node;

    lexer->insert = null;  /* defer implicit inline start tags */

    if (field->tag == tag_textarea)
        mode = Preformatted;
    else
        mode = MixedContent;  /* kludge for font tags */

    while ((node = GetToken(lexer, mode)) != null)
    {
        /* matching end tag closes the element */
        if (node->tag == field->tag && node->type == EndTag)
        {
            FreeNode(node);
            field->closed = yes;
            TrimSpaces(lexer, field);
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(field, node))
            continue;

        if (node->type == TextNode)
        {
            /* only called for 1st child */
            if (field->content == null && !(mode & Preformatted))
                TrimSpaces(lexer, field);

            /* drop text nodes that have no characters */
            if (node->start >= node->end)
            {
                FreeNode(node);
                continue;
            }

            InsertNodeAtEnd(field, node);
            continue;
        }

        /* for textarea should all cases of < and & be escaped? */

        /* discard inline tags e.g. font */
        if (node->tag && (node->tag->model & CM_INLINE))
        {
            ReportWarning(lexer, field, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /* terminate element on other tags */
        if (!(field->tag->model & CM_OPT))
            ReportWarning(lexer, field, node, MISSING_ENDTAG_BEFORE);

        UngetToken(lexer);
        TrimSpaces(lexer, field);
        return;
    }

    /* input ran out before the end tag was seen (node is null here) */
    if (!(field->tag->model & CM_OPT))
        ReportWarning(lexer, field, node, MISSING_ENDTAG_FOR);
}

/*
  Parse the content of a title element.

  Tokens are always read with MixedContent; the mode argument is not
  used. A second title start tag is coerced into an end tag and
  pushed back so that the next iteration closes the element. Unknown
  tags are dropped; any other tag ends the title and is pushed back.
*/
void ParseTitle(Lexer *lexer, Node *title, uint mode)
{
    Node *node;

    while ((node = GetToken(lexer, MixedContent)) != null)
    {
        if (node->tag == title->tag && node->type == StartTag)
        {
            ReportWarning(lexer, title, node, COERCE_TO_ENDTAG);
            node->type = EndTag;
            UngetToken(lexer);
            continue;
        }
        else if (node->tag == title->tag && node->type == EndTag)
        {
            FreeNode(node);
            title->closed = yes;
            TrimSpaces(lexer, title);
            return;
        }

        if (node->type == TextNode)
        {
            /* only called for 1st child */
            if (title->content == null)
                TrimInitialSpace(lexer, title, node);

            /* drop text nodes that have no characters left */
            if (node->start >= node->end)
            {
                FreeNode(node);
                continue;
            }

            InsertNodeAtEnd(title, node);
            continue;
        }

        /* deal with comments etc. */
        if (InsertMisc(title, node))
            continue;

        /* discard unknown tags */
        if (node->tag == null)
        {
            ReportWarning(lexer, title, node, DISCARDING_UNEXPECTED);
            FreeNode(node);
            continue;
        }

        /* pushback unexpected tokens */
        ReportWarning(lexer, title, node, MISSING_ENDTAG_BEFORE);
        UngetToken(lexer);
        TrimSpaces(lexer, title);
        return;
    }

    /* input ran out before the end tag was seen (node is null here) */
    ReportWarning(lexer, title, node, MISSING_ENDTAG_FOR);
}

/*
  This isn't quite right for CDATA content as it recognises
  tags within the content and parses them accordingly.
  This will unfortunately screw up scripts which include
  < + letter,  < + !, < + ?
or < + / + letter */ void ParseScript(Lexer *lexer, Node *script, uint mode) { Node *node; node = GetCDATA(lexer, script); if (node) InsertNodeAtEnd(script, node); } Bool IsJavaScript(Node *node) { Bool result = no; AttVal *attr; if (node->attributes == null) return yes; for (attr = node->attributes; attr; attr = attr->next) { if ( (wstrcasecmp(attr->attribute, "language") == 0 || wstrcasecmp(attr->attribute, "type") == 0) && wsubstr(attr->value, "javascript")) result = yes; } return result; } void ParseHead(Lexer *lexer, Node *head, uint mode) { Node *node; int HasTitle = 0; int HasBase = 0; while ((node = GetToken(lexer, IgnoreWhitespace)) != null) { if (node->tag == head->tag && node->type == EndTag) { FreeNode(node); head->closed = yes; break; } if (node->type == TextNode) { UngetToken(lexer); break; } /* deal with comments etc. */ if (InsertMisc(head, node)) continue; if (node->type == DocTypeTag) { InsertDocType(lexer, head, node); continue; } /* discard unknown tags */ if (node->tag == null) { ReportWarning(lexer, head, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* if it doesn't belong in the head then treat as implicit head of head and deal with as part of the body */ if (!(node->tag->model & CM_HEAD)) { UngetToken(lexer); break; } if (node->type == StartTag || node->type == StartEndTag) { if (node->tag == tag_title) { ++HasTitle; if (HasTitle > 1) ReportWarning(lexer, head, node, TOO_MANY_ELEMENTS); } else if (node->tag == tag_base) { ++HasBase; if (HasBase > 1) ReportWarning(lexer, head, node, TOO_MANY_ELEMENTS); } else if (node->tag == tag_noscript) ReportWarning(lexer, head, node, TAG_NOT_ALLOWED_IN); InsertNodeAtEnd(head, node); ParseTag(lexer, node, IgnoreWhitespace); continue; } /* discard unexpected text nodes and end tags */ ReportWarning(lexer, head, node, DISCARDING_UNEXPECTED); FreeNode(node); } if (HasTitle == 0) { if (!BodyOnly) /* Feature request #434940 - fix by Ignacio Vazquez-Abrams 21 Jun 01 */ ReportWarning(lexer, head, 
null, MISSING_TITLE_ELEMENT); InsertNodeAtEnd(head, InferredTag(lexer, "title")); } } void ParseBody(Lexer *lexer, Node *body, uint mode) { Node *node; Bool checkstack, iswhitenode; mode = IgnoreWhitespace; checkstack = yes; BumpObject(lexer, body->parent); while ((node = GetToken(lexer, mode)) != null) { if (node->tag == body->tag && node->type == EndTag) { body->closed = yes; TrimSpaces(lexer, body); FreeNode(node); SeenBodyEndTag = 1; mode = IgnoreWhitespace; if (body->parent->tag == tag_noframes) break; continue; } if (node->tag == tag_noframes) { if (node->type == StartTag) { InsertNodeAtEnd(body, node); ParseBlock(lexer, node, mode); continue; } if (node->type == EndTag && body->parent->tag == tag_noframes) { TrimSpaces(lexer, body); UngetToken(lexer); break; } } if ((node->tag == tag_frame || node->tag == tag_frameset) && body->parent->tag == tag_noframes) { TrimSpaces(lexer, body); UngetToken(lexer); break; } if (node->tag == tag_html) { if (node->type == StartTag || node->type == StartEndTag) ReportWarning(lexer, body, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } iswhitenode = no; if (node->type == TextNode && node->end <= node->start + 1 && lexer->lexbuf[node->start] == ' ') iswhitenode = yes; /* deal with comments etc. 
*/ if (InsertMisc(body, node)) continue; if (SeenBodyEndTag == 1 && !iswhitenode) { ++SeenBodyEndTag; ReportWarning(lexer, body, node, CONTENT_AFTER_BODY); } /* mixed content model permits text */ if (node->type == TextNode) { if (iswhitenode && mode == IgnoreWhitespace) { FreeNode(node); continue; } if (EncloseBodyText && !iswhitenode) { Node *para; UngetToken(lexer); para = InferredTag(lexer, "p"); InsertNodeAtEnd(body, para); ParseTag(lexer, para, mode); mode = MixedContent; continue; } else /* HTML 2 and HTML4 strict don't allow text here */ ConstrainVersion(lexer, ~(VERS_HTML40_STRICT | VERS_HTML20)); if (checkstack) { checkstack = no; if (InlineDup(lexer, node) > 0) continue; } InsertNodeAtEnd(body, node); mode = MixedContent; continue; } if (node->type == DocTypeTag) { InsertDocType(lexer, body, node); continue; } /* discard unknown and PARAM tags */ if (node->tag == null || node->tag == tag_param) { ReportWarning(lexer, body, node, DISCARDING_UNEXPECTED); FreeNode(node); continue; } /* Netscape allows LI and DD directly in BODY We infer UL or DL respectively and use this Bool to exclude block-level elements so as to match Netscape's observed behaviour. 
*/ lexer->excludeBlocks = no; if ((!(node->tag->model & CM_BLOCK) && !(node->tag->model & CM_INLINE)) || node->tag == tag_input) { /* avoid this error message being issued twice */ if (!(node->tag->model & CM_HEAD)) ReportWarning(lexer, body, node, TAG_NOT_ALLOWED_IN); if (node->tag->model & CM_HTML) { /* copy body attributes if current body was inferred */ if (node->tag == tag_body && body->implicit && body->attributes == null) { body->attributes = node->attributes; node->attributes = null; } FreeNode(node); continue; } if (node->tag->model & CM_HEAD) { MoveToHead(lexer, body, node); continue; } if (node->tag->model & CM_LIST) { UngetToken(lexer); node = InferredTag(lexer, "ul"); AddClass(node, "noindent"); lexer->excludeBlocks = yes; } else if (node->tag->model & CM_DEFLIST) { UngetToken(lexer); node = InferredTag(lexer, "dl"); lexer->excludeBlocks = yes; } else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW)) { UngetToken(lexer); node = InferredTag(lexer, "table"); lexer->excludeBlocks = yes; } else if (node->tag == tag_input) { UngetToken(lexer); node = InferredTag(lexer, "form"); lexer->excludeBlocks = yes; } else { if (!(node->tag->model & (CM_ROW | CM_FIELD))) { UngetToken(lexer); return; } /* ignore