2 * @(#)Lexer.java 1.11 2000/08/16
10 * Lexer for html parser
12 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
13 * See Tidy.java for the copyright notice.
14 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
15 * HTML Tidy Release 4 Aug 2000</a>
17 * @author Dave Raggett <dsr@w3.org>
18 * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
19 * @version 1.0, 1999/05/22
20 * @version 1.0.1, 1999/05/29
21 * @version 1.1, 1999/06/18 Java Bean
22 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
23 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
24 * @version 1.4, 1999/09/04 DOM support
25 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
26 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
27 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
28 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
29 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
30 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
31 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
35 Given a file stream fp it returns a sequence of tokens.
37 GetToken(fp) gets the next token
38 UngetToken(fp) provides one level undo
40 The tags include an attribute list:
42 - linked list of attribute/value nodes
43 - each node has 2 null-terminated strings.
44 - entities are replaced in attribute values
46 white space is compacted if not in preformatted mode
47 If not in preformatted mode then leading white space
48 is discarded and subsequent white space sequences
49 compacted to single space chars.
51 If XmlTags is no then tag names are folded to lower
52 case (see parseTagName), as are attribute names.
55 - Doctype subset and marked sections
58 import java.io.PrintWriter;
59 import java.util.Stack;
60 import java.util.Vector;
62 import org.eclipse.core.resources.IFile;
63 import sun.security.krb5.internal.av;
68 public StreamIn in; /* file stream */
69 public PrintWriter errout; /* error output stream */
70 public short badAccess; /* for accessibility errors */
71 public short badLayout; /* for bad style errors */
72 public short badChars; /* for bad char encodings */
73 public short badForm; /* for mismatched/mispositioned form tags */
74 public short warnings; /* count of warnings in this document */
75 public short errors; /* count of errors */
76 public int lines; /* lines seen */
77 public int columns; /* at start of current token */
78 public boolean waswhite; /* used to collapse contiguous white space */
79 public boolean pushed; /* true after token has been pushed back */
80 public boolean insertspace; /* when space is moved after end tag */
81 public boolean excludeBlocks; /* Netscape compatibility */
82 public boolean exiled; /* true if moved out of table */
83 public boolean isvoyager; /* true if xmlns attribute on html element */
84 public short versions; /* bit vector of HTML versions; starts as Dict.VERS_EVERYTHING and is narrowed with &= for each recognised tag */
85 public int doctype; /* version as given by doctype (if any) */
86 public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
87 public int txtstart; /* start of current node */
88 public int txtend; /* end of current node */
89 public short state; /* state of lexer's finite state machine */
93 lexer character buffer
95 parse tree nodes span onto this buffer
96 which contains the concatenated text
97 contents of all of the elements.
99 lexsize must be reset for each file.
101 public byte[] lexbuf; /* byte buffer of UTF-8 chars */
102 public int lexlength; /* allocated */
103 public int lexsize; /* used */
105 /* Inline stack for compatibility with Mosaic */
106 public Node inode; /* for deferring text node */
107 public int insert; /* for inferring inline tags */
109 public int istackbase; /* start of frame */
111 public Style styles; /* used for cleaning up presentation markup */
/* Options handed to the constructor; read throughout this class (e.g. XmlTags, XmlOut, docTypeMode, and the tag table tt). */
113 public Configuration configuration;
114 protected int seenBodyEndTag; /* used by parser */
/* Registry of every Node created through newNode()/cloneNode() (plus cloned
   attribute asp/php entries). updateNodeTextArrays() walks it to repoint
   nodes at the new buffer whenever addByte() reallocates lexbuf. */
115 private Vector nodeList;
/**
 * Creates a lexer in its initial state.
 *
 * The visible initialisation puts the state machine in LEX_CONTENT, clears
 * the waswhite / insertspace / isvoyager / badDoctype flags, treats every
 * HTML version as still possible (Dict.VERS_EVERYTHING) with no doctype
 * seen yet (Dict.VERS_UNKNOWN), and creates the inline stack and the node
 * registry used by newNode().
 *
 * @param iFile         Eclipse resource for the document being lexed
 *                      (presumably what getIFile() returns; the assignment
 *                      is not visible in this excerpt -- TODO confirm)
 * @param in            character source (presumably stored in this.in;
 *                      assignment not visible -- TODO confirm)
 * @param configuration options stored in this.configuration
 */
117 public Lexer(IFile iFile, StreamIn in, Configuration configuration)
123 this.state = LEX_CONTENT;
130 this.waswhite = false;
132 this.insertspace = false;
134 this.isvoyager = false;
135 this.versions = Dict.VERS_EVERYTHING;
136 this.doctype = Dict.VERS_UNKNOWN;
137 this.badDoctype = false;
146 this.istack = new Stack();
149 this.configuration = configuration;
150 this.seenBodyEndTag = 0;
151 this.nodeList = new Vector();
154 public IFile getIFile() {
/**
 * Creates an empty Node and records it in nodeList so that
 * updateNodeTextArrays() can later repoint its text buffer.
 * All nodes that reference lexbuf should be created through the
 * newNode() family rather than with "new Node(...)" directly.
 */
158 public Node newNode()
160 Node node = new Node();
161 nodeList.addElement(node);
/**
 * Creates a Node of the given type spanning textarray[start..end) and
 * records it in nodeList (see updateNodeTextArrays()).
 *
 * @param type      node type constant (e.g. Node.TextNode, Node.DocTypeTag)
 * @param textarray buffer holding the node text, normally this.lexbuf
 * @param start     offset of the first byte of the node text
 * @param end       offset just past the node text
 */
165 public Node newNode(short type, byte[] textarray, int start, int end)
167 Node node = new Node(type, textarray, start, end);
168 nodeList.addElement(node);
/**
 * Creates an element Node with the given name, passing the configured tag
 * table (configuration.tt) to the Node constructor, and records the node in
 * nodeList (see updateNodeTextArrays()).
 *
 * @param type      node type constant (e.g. Node.StartTag, Node.EndTag)
 * @param textarray buffer holding the node text, normally this.lexbuf
 * @param start     offset of the first byte of the node text
 * @param end       offset just past the node text
 * @param element   element name
 */
172 public Node newNode(short type, byte[] textarray, int start, int end, String element)
174 Node node = new Node(type, textarray, start, end, element, configuration.tt);
175 nodeList.addElement(node);
/**
 * Clones a node and registers the clone in nodeList. The clone's attribute
 * list is then walked and each attribute's asp / php entry is registered as
 * well, so that those entries are also repointed by updateNodeTextArrays().
 * NOTE(review): the lines guarding the two addElement calls (presumably
 * null checks) are not visible in this excerpt -- confirm.
 *
 * @param node node to copy
 */
179 public Node cloneNode(Node node)
181 Node cnode = (Node)node.clone();
182 nodeList.addElement(cnode);
183 for (AttVal att = cnode.attributes; att != null; att = att.next) {
185 nodeList.addElement(att.asp);
187 nodeList.addElement(att.php);
/**
 * Clones an attribute list and registers the asp / php entry of every
 * attribute in the copy with nodeList (same bookkeeping as cloneNode()).
 * attrs is dereferenced unconditionally, so it must not be null.
 * NOTE(review): the lines guarding the two addElement calls are not
 * visible in this excerpt -- confirm.
 *
 * @param attrs head of the attribute list to copy
 */
192 public AttVal cloneAttributes(AttVal attrs)
194 AttVal cattrs = (AttVal)attrs.clone();
195 for (AttVal att = cattrs; att != null; att = att.next) {
197 nodeList.addElement(att.asp);
199 nodeList.addElement(att.php);
204 protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
207 for (int i = 0; i < nodeList.size(); i++) {
208 node = (Node)(nodeList.elementAt(i));
209 if (node.textarray == oldtextarray)
210 node.textarray = newtextarray;
214 /* used for creating preformatted text from Word2000 */
/* Builds a node whose text is a single '\n': the newline is appended to
   lexbuf and the node's start/end offsets bracket it. */
215 public Node newLineNode()
217 Node node = newNode();
219 node.textarray = this.lexbuf;
220 node.start = this.lexsize;
221 addCharToLexer((int)'\n');
222 node.end = this.lexsize;
226 // Should always be able convert to/from UTF-8, so encoding exceptions are
227 // converted to an Error to avoid adding throws declarations in
230 public static byte[] getBytes(String str) {
232 return str.getBytes("UTF8");
233 } catch (java.io.UnsupportedEncodingException e) {
234 throw new Error("string to UTF-8 conversion failed: " + e.getMessage());
238 public static String getString(byte[] bytes, int offset, int length) {
240 return new String(bytes, offset, length, "UTF8");
241 } catch (java.io.UnsupportedEncodingException e) {
242 throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
246 public boolean endOfInput()
248 return this.in.isEndOfStream();
/**
 * Appends one byte to lexbuf.
 *
 * The buffer is grown first whenever fewer than two free slots remain: one
 * for the byte itself and one for the '\0' written after it. Capacity
 * starts at 8192 bytes and grows by doubling. After a reallocation the old
 * contents are copied across and every registered node is repointed at the
 * new array via updateNodeTextArrays().
 * NOTE(review): the "else" before the doubling and any null guard around
 * the arraycopy are not visible in this excerpt -- confirm.
 *
 * @param c value to store; only the low 8 bits are kept
 */
251 public void addByte(int c)
253 if (this.lexsize + 1 >= this.lexlength)
255 while (this.lexsize + 1 >= this.lexlength)
257 if (this.lexlength == 0)
258 this.lexlength = 8192;
260 this.lexlength = this.lexlength * 2;
263 byte[] temp = this.lexbuf;
264 this.lexbuf = new byte[ this.lexlength ];
267 System.arraycopy( temp, 0, this.lexbuf, 0, temp.length );
268 updateNodeTextArrays(temp, this.lexbuf);
272 this.lexbuf[this.lexsize++] = (byte)c;
/* terminator is written past the used region and is not counted in lexsize */
273 this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */
276 public void changeChar(byte c)
278 if (this.lexsize > 0)
280 this.lexbuf[this.lexsize-1] = c;
284 /* store char c as UTF-8 encoded byte stream */
/* Visible branches: a two-byte form (lead byte 0xC0), a three-byte form for
   c <= 0xFFFF (lead 0xE0), a four-byte form for c <= 0x1FFFFF (lead 0xF0)
   and a five-byte form (lead 0xF8) for anything larger. The five-byte form
   is outside standard UTF-8, which stops at four bytes.
   NOTE(review): the conditions selecting the one- and two-byte forms are
   not visible in this excerpt (presumably c < 128 and c <= 0x7FF). */
285 public void addCharToLexer(int c)
291 addByte(0xC0 | (c >> 6));
292 addByte(0x80 | (c & 0x3F));
294 else if (c <= 0xFFFF)
296 addByte(0xE0 | (c >> 12));
297 addByte(0x80 | ((c >> 6) & 0x3F));
298 addByte(0x80 | (c & 0x3F));
300 else if (c <= 0x1FFFFF)
302 addByte(0xF0 | (c >> 18));
303 addByte(0x80 | ((c >> 12) & 0x3F));
304 addByte(0x80 | ((c >> 6) & 0x3F));
305 addByte(0x80 | (c & 0x3F));
309 addByte(0xF8 | (c >> 24));
310 addByte(0x80 | ((c >> 18) & 0x3F));
311 addByte(0x80 | ((c >> 12) & 0x3F));
312 addByte(0x80 | ((c >> 6) & 0x3F));
313 addByte(0x80 | (c & 0x3F));
317 public void addStringToLexer(String str)
319 for ( int i = 0; i < str.length(); i++ ) {
320 addCharToLexer( (int)str.charAt(i) );
325 No longer attempts to insert missing ';' for unknown
326 entities unless one was present already, since this
327 gives unexpected results.
329 For example: <a href="something.htm?foo&bar&fred">
330 was tidied to: <a href="something.htm?foo&bar;&fred;">
331 rather than: <a href="something.htm?foo&bar&fred">
333 My thanks to Maurice Buxton for spotting this.
/**
 * Parses an entity reference whose leading '&' has already been stored in
 * lexbuf, and replaces it in the buffer.
 *
 * Visible behaviour: characters are read from the input until one is found
 * that cannot be part of the entity; that character is pushed back. The
 * collected text (starting at the '&') is looked up in the default
 * EntityTable. Unrecognised text is reported as UNKNOWN_ENTITY when more
 * than the bare '&' was collected, otherwise as UNESCAPED_AMPERSAND. For a
 * recognised entity a MISSING_SEMICOLON warning is issued when the
 * terminating character is not ';', and lexsize is rewound to the '&' so
 * the decoded character can be written in its place. Code 160 in
 * Preformatted mode and '&' with QuoteAmpersand disabled are special-cased.
 * NOTE(review): several branch bodies are not visible in this excerpt;
 * the summary above covers only what the visible lines establish.
 *
 * @param mode lexer mode; only the Preformatted bit is tested here
 */
335 public void parseEntity(short mode)
339 boolean first = true;
340 boolean semicolon = false;
341 boolean numeric = false;
345 start = this.lexsize - 1; /* to start at "&" */
346 startcol = this.in.curcol - 1;
350 c = this.in.readChar();
351 if (c == StreamIn.EndOfStream) break;
358 if (first && c == '#')
369 /* AQ: Added flag for numeric entities so that numeric entities
370 with missing semi-colons are recognized.
371 Eg. "rep..." is recognized as "rep"
373 if (numeric && ((c == 'x') || ((map & DIGIT) != 0)))
378 if (!numeric && ((map & NAMECHAR) != 0))
384 /* otherwise put it back */
386 this.in.ungetChar(c);
390 str = getString( this.lexbuf, start, this.lexsize - start );
391 ch = EntityTable.getDefaultEntityTable().entityCode( str );
393 /* deal with unrecognized entities */
396 /* set error position just before offending character */
397 this.lines = this.in.curline;
398 this.columns = startcol;
400 if (this.lexsize > start +1 )
402 Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
409 Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
414 if (c != ';') /* issue warning if not terminated by ';' */
416 /* set error position just before offending character */
417 this.lines = this.in.curline;
418 this.columns = startcol;
419 Report.entityError(this, Report.MISSING_SEMICOLON, str, c);
/* rewind to the '&' so the entity text is overwritten by what follows */
422 this.lexsize = start;
424 if (ch == 160 && (mode & Preformatted) != 0)
429 if (ch == '&' && !this.configuration.QuoteAmpersand)
/**
 * Reads the remainder of a tag name from the input.
 *
 * The first character of the name is already in lexbuf at txtstart; it is
 * folded to lower case in place. Further characters are read until one
 * that is not a NAMECHAR (or end of stream) is met, upper-case letters
 * being folded to lower case as they are read. No folding is done when
 * configuration.XmlTags is set. On exit txtend is set to lexsize.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the character that ended the name is returned -- confirm.
 */
439 public char parseTagName()
444 /* fold case of first char in buffer */
446 c = this.lexbuf[this.txtstart];
449 if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
451 c += (int)((int)'a' - (int)'A');
452 this.lexbuf[this.txtstart] = (byte)c;
457 c = this.in.readChar();
458 if (c == StreamIn.EndOfStream) break;
461 if ((map & NAMECHAR) == 0)
464 /* fold case of subsequent chars */
466 if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
467 c += (int)((int)'a' - (int)'A');
472 this.txtend = this.lexsize;
476 public void addStringLiteral(String str)
478 for ( int i = 0; i < str.length(); i++ ) {
479 addCharToLexer( (int)str.charAt(i) );
483 /* choose what version to use for new doctype */
484 public short HTMLVersion()
488 versions = this.versions;
490 if ((versions & Dict.VERS_HTML20) != 0)
491 return Dict.VERS_HTML20;
493 if ((versions & Dict.VERS_HTML32) != 0)
494 return Dict.VERS_HTML32;
496 if ((versions & Dict.VERS_HTML40_STRICT) != 0)
497 return Dict.VERS_HTML40_STRICT;
499 if ((versions & Dict.VERS_HTML40_LOOSE) != 0)
500 return Dict.VERS_HTML40_LOOSE;
502 if ((versions & Dict.VERS_FRAMES) != 0)
503 return Dict.VERS_FRAMES;
505 return Dict.VERS_UNKNOWN;
/**
 * Returns the display name of the version reported by apparentVersion().
 *
 * The W3CVersion table is searched for an entry with a matching code; the
 * entry's voyagerName or name is returned.
 * NOTE(review): the condition choosing between voyagerName and name, and
 * the value returned when nothing matches, are not visible in this
 * excerpt (presumably isvoyager and null respectively) -- confirm.
 */
508 public String HTMLVersionName()
513 guessed = apparentVersion();
515 for (j = 0; j < W3CVersion.length; ++j)
517 if (guessed == W3CVersion[j].code)
520 return W3CVersion[j].voyagerName;
522 return W3CVersion[j].name;
529 /* add meta element for Tidy */
/* Looks in the document's head for a meta element whose name attribute is
   "generator" and whose content attribute starts with "HTML Tidy" (both
   compared case-insensitively). When the scan does not end early, a new
   meta element with content "HTML Tidy, see www.w3.org" is inferred and
   inserted at the start of head.
   NOTE(review): the return statements and the null check on head are not
   visible in this excerpt -- confirm what true/false mean to callers. */
530 public boolean addGenerator(Node root)
534 Node head = root.findHEAD(configuration.tt);
538 for (node = head.content; node != null; node = node.next)
540 if (node.tag == configuration.tt.tagMeta)
542 attval = node.getAttrByName("name");
544 if (attval != null && attval.value != null &&
545 Lexer.wstrcasecmp(attval.value, "generator") == 0)
547 attval = node.getAttrByName("content");
/* length is checked first so substring(0, 9) cannot run off the end */
549 if (attval != null && attval.value != null &&
550 attval.value.length() >= 9 &&
551 Lexer.wstrcasecmp(attval.value.substring(0, 9), "HTML Tidy") == 0)
559 node = this.inferredTag("meta");
560 node.addAttribute("content", "HTML Tidy, see www.w3.org");
561 node.addAttribute("name", "generator");
562 Node.insertNodeAtStart(head, node);
569 /* return true if substring s is in p and isn't all in upper case */
570 /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
571 /* len is how many chars to check in p */
/* A window of n characters of p starting at i is compared with s ignoring
   case; on the first match the result is whether the window differs from s
   when compared exactly.
   NOTE(review): the loop header and the definition of n are not visible in
   this excerpt (n is presumably s.length()) -- confirm. */
572 private static boolean findBadSubString(String s, String p, int len)
580 ps = p.substring(i, i + n);
581 if (wstrcasecmp(s, ps) == 0)
582 return (!ps.equals(s.substring(0, n)));
/**
 * Checks the letter case of the keywords in a doctype declaration.
 *
 * The doctype text is decoded from lexbuf and searched with
 * findBadSubString() for "SYSTEM", "PUBLIC", "//DTD", "//W3C" and "//EN".
 * NOTE(review): the return statement wrapping the five tests is not
 * visible in this excerpt. findGivenVersion() warns DTYPE_NOT_UPPER_CASE
 * when this method returns false, so the result is presumably the negation
 * of "any keyword found in the wrong case" -- confirm.
 *
 * @param doctype doctype node whose start/end index into lexbuf
 */
591 public boolean checkDocTypeKeyWords(Node doctype)
593 int len = doctype.end - doctype.start;
594 String s = getString(this.lexbuf, doctype.start, len);
597 findBadSubString("SYSTEM", s, len) ||
598 findBadSubString("PUBLIC", s, len) ||
599 findBadSubString("//DTD", s, len) ||
600 findBadSubString("//W3C", s, len) ||
601 findBadSubString("//EN", s, len)
605 /* examine <!DOCTYPE> to identify version */
/* Visible steps:
   1. the doctype text must start with "html " (case-insensitive);
   2. a DTYPE_NOT_UPPER_CASE warning is issued when checkDocTypeKeyWords()
      returns false;
   3. a leading "SYSTEM " keyword is rewritten in upper case in lexbuf and
      0 (unrecognized) is returned;
   4. a leading "PUBLIC " keyword is rewritten in upper case in lexbuf;
      badDoctype is set on a path whose condition is not visible here;
   5. the text is scanned for a double quote followed by "-//W3C//DTD " or
      "-//IETF//DTD "; the identifier up to the next '/' is compared with
      the names in W3CVersion (entry 0 is used for the IETF form, entries
      1.. for the W3C form) and the matching code is returned.
   NOTE(review): the computation of len and the final return are not
   visible in this excerpt. */
606 public short findGivenVersion(Node doctype)
614 /* if root tag for doctype isn't html give up now */
615 str1 = getString(this.lexbuf, doctype.start, 5);
616 if (wstrcasecmp(str1, "html ") != 0)
619 if (!checkDocTypeKeyWords(doctype))
620 Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
622 /* give up if all we are given is the system id for the doctype */
623 str1 = getString(this.lexbuf, doctype.start + 5, 7);
624 if (wstrcasecmp(str1, "SYSTEM ") == 0)
626 /* but at least ensure the case is correct */
627 if (!str1.substring(0, 6).equals("SYSTEM"))
628 System.arraycopy( getBytes("SYSTEM"), 0,
629 this.lexbuf, doctype.start + 5, 6 );
630 return 0; /* unrecognized */
633 if (wstrcasecmp(str1, "PUBLIC ") == 0)
/* only the first 6 bytes are copied, so the trailing space in the
   "PUBLIC " literal below is never written */
635 if (!str1.substring(0, 6).equals("PUBLIC"))
636 System.arraycopy( getBytes("PUBLIC "), 0,
637 this.lexbuf, doctype.start + 5, 6 );
640 this.badDoctype = true;
642 for (i = doctype.start; i < doctype.end; ++i)
644 if (this.lexbuf[i] == (byte)'"')
646 str1 = getString( this.lexbuf, i + 1, 12 );
647 str2 = getString( this.lexbuf, i + 1, 13 );
648 if (str1.equals("-//W3C//DTD "))
650 /* compute length of identifier e.g. "HTML 4.0 Transitional" */
651 for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
653 p = getString( this.lexbuf, i + 13, len );
655 for (j = 1; j < W3CVersion.length; ++j)
657 s = W3CVersion[j].name;
658 if (len == s.length() && s.equals(p))
659 return W3CVersion[j].code;
662 /* else unrecognized version */
664 else if (str2.equals("-//IETF//DTD "))
666 /* compute length of identifier e.g. "HTML 2.0" */
667 for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
670 p = getString( this.lexbuf, i + 14, len );
671 s = W3CVersion[0].name;
672 if (len == s.length() && s.equals(p))
673 return W3CVersion[0].code;
675 /* else unrecognized version */
/**
 * Makes sure the html element carries an xmlns attribute equal to profile.
 *
 * The root's children are scanned for the html element. Its attributes are
 * then searched for "xmlns": an existing value that differs from profile
 * triggers an INCONSISTENT_NAMESPACE warning and is overwritten. On the
 * remaining path a new AttVal is created in front of the existing
 * attribute list, resolved through the default AttributeTable and
 * installed as the head of node.attributes.
 * NOTE(review): the branch structure between these steps (including the
 * case where no html element is found) is not visible in this excerpt.
 *
 * @param root    document root whose content list is searched
 * @param profile namespace URI the xmlns attribute should have
 */
684 public void fixHTMLNameSpace(Node root, String profile)
689 for (node = root.content;
690 node != null && node.tag != configuration.tt.tagHtml; node = node.next);
696 for (attr = node.attributes; attr != null; attr = attr.next)
698 if (attr.attribute.equals("xmlns"))
706 if (!attr.value.equals(profile))
708 Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
709 attr.value = profile;
714 attr = new AttVal( node.attributes, null, (int)'"',
717 AttributeTable.getDefaultAttributeTable().findAttribute( attr );
718 node.attributes = attr;
/**
 * Installs an XHTML 1.0 doctype on the document.
 *
 * Visible behaviour: with DOCTYPE_OMIT an existing doctype is discarded.
 * Otherwise the public identifier (fpi) and system identifier (sysid) are
 * chosen: in DOCTYPE_AUTO mode from the versions bit vector (strict, then
 * loose, then frameset, defaulting to transitional); in DOCTYPE_STRICT /
 * DOCTYPE_LOOSE mode directly; in DOCTYPE_USER mode fpi is taken from
 * configuration.docTypeStr when that is set. The html element's namespace
 * is fixed up, a doctype node is created at the front of root.content on
 * one path, and the text
 *   html PUBLIC "fpi" <newline> "sysid"
 * is appended to lexbuf with the doctype node's start/end set around it.
 * NOTE(review): the return statements and several conditions (e.g. the
 * null check on the existing doctype) are not visible in this excerpt.
 *
 * @param root document root
 */
723 public boolean setXHTMLDocType(Node root)
727 String namespace = XHTML_NAMESPACE;
730 doctype = root.findDocType();
732 if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
735 Node.discardElement(doctype);
739 if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
741 /* see what flavor of XHTML this document matches */
742 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
743 { /* use XHTML strict */
744 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
745 sysid = voyager_strict;
747 else if ((this.versions & Dict.VERS_LOOSE) != 0)
749 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
750 sysid = voyager_loose;
752 else if ((this.versions & Dict.VERS_FRAMES) != 0)
753 { /* use XHTML frames */
754 fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
755 sysid = voyager_frameset;
757 else /* lets assume XHTML transitional */
759 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
760 sysid = voyager_loose;
763 else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
765 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
766 sysid = voyager_strict;
768 else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
770 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
771 sysid = voyager_loose;
774 fixHTMLNameSpace(root, namespace);
/* new doctype node starts out empty (0..0) and is linked in as the first child of root */
778 doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
779 doctype.next = root.content;
780 doctype.parent = root;
782 root.content = doctype;
785 if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
786 configuration.docTypeStr != null)
788 fpi = configuration.docTypeStr;
792 this.txtstart = this.lexsize;
793 this.txtend = this.lexsize;
795 /* add public identifier */
796 addStringLiteral("html PUBLIC ");
798 /* check if the fpi is quoted or not */
799 if (fpi.charAt(0) == '"')
800 addStringLiteral(fpi);
803 addStringLiteral("\"");
804 addStringLiteral(fpi);
805 addStringLiteral("\"");
/* the system id always goes on a new line; the indent is dropped when it would not fit within wraplen */
808 if (sysid.length() + 6 >= this.configuration.wraplen)
809 addStringLiteral("\n\"");
811 addStringLiteral("\n \"");
813 /* add system identifier */
814 addStringLiteral(sysid);
815 addStringLiteral("\"");
817 this.txtend = this.lexsize;
819 doctype.start = this.txtstart;
820 doctype.end = this.txtend;
/**
 * Returns the HTML version the document appears to be.
 *
 * With no doctype (VERS_UNKNOWN) the answer is HTMLVersion(). Otherwise
 * the version named by the doctype is returned as long as its bit is
 * still set in versions. When the switch is left without returning, an
 * INCONSISTENT_VERSION warning is issued and HTMLVersion() is used.
 * NOTE(review): some "break" lines of the switch are not visible in this
 * excerpt.
 */
825 public short apparentVersion()
827 switch (this.doctype)
829 case Dict.VERS_UNKNOWN:
830 return HTMLVersion();
832 case Dict.VERS_HTML20:
833 if ((this.versions & Dict.VERS_HTML20) != 0)
834 return Dict.VERS_HTML20;
838 case Dict.VERS_HTML32:
839 if ((this.versions & Dict.VERS_HTML32) != 0)
840 return Dict.VERS_HTML32;
842 break; /* to replace old version by new */
844 case Dict.VERS_HTML40_STRICT:
845 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
846 return Dict.VERS_HTML40_STRICT;
850 case Dict.VERS_HTML40_LOOSE:
851 if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
852 return Dict.VERS_HTML40_LOOSE;
854 break; /* to replace old version by new */
856 case Dict.VERS_FRAMES:
857 if ((this.versions & Dict.VERS_FRAMES) != 0)
858 return Dict.VERS_FRAMES;
863 Report.warning(this, null, null, Report.INCONSISTENT_VERSION);
864 return this.HTMLVersion();
867 /* fixup doctype if missing */
/* Visible behaviour:
   - a MALFORMED_DOCTYPE warning is issued on a path whose condition is not
     visible here (presumably badDoctype);
   - DOCTYPE_OMIT discards an existing doctype;
   - DOCTYPE_STRICT / DOCTYPE_LOOSE discard it and force the guess to HTML
     4.0 strict / loose;
   - DOCTYPE_AUTO examines the version given by the existing doctype
     against the versions bit vector, then picks a new version with
     HTMLVersion();
   - for XML output, XML tags or a voyager document the doctype is
     discarded and the html namespace is set from the matching W3CVersion
     profile;
   - otherwise a doctype node is created at the front of root.content when
     needed and its text is written to lexbuf as "html PUBLIC " plus the
     user-supplied string, the IETF HTML 2.0 identifier, or
     "-//W3C//DTD <name>//EN".
   NOTE(review): the return statements and several branch bodies are not
   visible in this excerpt. */
868 public boolean fixDocType(Node root)
871 int guessed = Dict.VERS_HTML40_STRICT, i;
874 Report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
876 if (configuration.XmlOut)
879 doctype = root.findDocType();
881 if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
884 Node.discardElement(doctype);
888 if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
890 Node.discardElement(doctype);
892 guessed = Dict.VERS_HTML40_STRICT;
894 else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
896 Node.discardElement(doctype);
898 guessed = Dict.VERS_HTML40_LOOSE;
900 else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
904 if (this.doctype == Dict.VERS_UNKNOWN)
907 switch (this.doctype)
909 case Dict.VERS_UNKNOWN:
912 case Dict.VERS_HTML20:
913 if ((this.versions & Dict.VERS_HTML20) != 0)
916 break; /* to replace old version by new */
918 case Dict.VERS_HTML32:
919 if ((this.versions & Dict.VERS_HTML32) != 0)
922 break; /* to replace old version by new */
924 case Dict.VERS_HTML40_STRICT:
925 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
928 break; /* to replace old version by new */
930 case Dict.VERS_HTML40_LOOSE:
931 if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
934 break; /* to replace old version by new */
936 case Dict.VERS_FRAMES:
937 if ((this.versions & Dict.VERS_FRAMES) != 0)
940 break; /* to replace old version by new */
943 /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
946 /* choose new doctype */
947 guessed = HTMLVersion();
950 if (guessed == Dict.VERS_UNKNOWN)
953 /* for XML use the Voyager system identifier */
954 if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager)
957 Node.discardElement(doctype);
959 for (i = 0; i < W3CVersion.length; ++i)
961 if (guessed == W3CVersion[i].code)
963 fixHTMLNameSpace(root, W3CVersion[i].profile);
/* new doctype node starts out empty (0..0) and is linked in as the first child of root */
973 doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
974 doctype.next = root.content;
975 doctype.parent = root;
977 root.content = doctype;
980 this.txtstart = this.lexsize;
981 this.txtend = this.lexsize;
983 /* use the appropriate public identifier */
984 addStringLiteral("html PUBLIC ");
986 if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
987 configuration.docTypeStr != null)
988 addStringLiteral(configuration.docTypeStr);
989 else if (guessed == Dict.VERS_HTML20)
990 addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
993 addStringLiteral("\"-//W3C//DTD ");
995 for (i = 0; i < W3CVersion.length; ++i)
997 if (guessed == W3CVersion[i].code)
999 addStringLiteral(W3CVersion[i].name);
1004 addStringLiteral("//EN\"");
1007 this.txtend = this.lexsize;
1009 doctype.start = this.txtstart;
1010 doctype.end = this.txtend;
1015 /* ensure XML document starts with <?xml version="1.0"?> */
/* If the first child of root is a processing instruction whose text begins
   with the lowercase letters "xml", it is recognised as an existing
   declaration. Otherwise a new ProcInsTag node is linked in front of
   root.content and its text -- xml version="1.0", plus
   encoding="ISO-8859-1" when CharEncoding is LATIN1 -- is appended to
   lexbuf.
   NOTE(review): the return statements and the assignment of root.content
   are not visible in this excerpt. xml.next is assigned root.content
   twice in the visible lines; the second assignment is redundant. */
1016 public boolean fixXMLPI(Node root)
1021 if( root.content != null && root.content.type == Node.ProcInsTag)
1023 s = root.content.start;
1025 if (this.lexbuf[s] == (byte)'x' &&
1026 this.lexbuf[s+1] == (byte)'m' &&
1027 this.lexbuf[s+2] == (byte)'l')
1031 xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0);
1032 xml.next = root.content;
1034 if (root.content != null)
1036 root.content.prev = xml;
1037 xml.next = root.content;
1042 this.txtstart = this.lexsize;
1043 this.txtend = this.lexsize;
1044 addStringLiteral("xml version=\"1.0\"");
1045 if (this.configuration.CharEncoding == Configuration.LATIN1)
1046 addStringLiteral(" encoding=\"ISO-8859-1\"");
1047 this.txtend = this.lexsize;
1049 xml.start = this.txtstart;
1050 xml.end = this.txtend;
/**
 * Creates a start-tag node for an element that was not present in the
 * source and marks it as implicit.
 * NOTE(review): the remaining newNode() arguments and the return statement
 * are not visible in this excerpt.
 *
 * @param name element name, e.g. "meta"
 */
1054 public Node inferredTag(String name)
1058 node = newNode(Node.StartTag,
1063 node.implicit = true;
/**
 * Tells whether a node is a start tag that can have content.
 *
 * Three tests are made in order: the node must be a StartTag; an unknown
 * element (tag == null) is handled specially; an element whose content
 * model includes Dict.CM_EMPTY is handled specially.
 * NOTE(review): the return value of each branch is not visible in this
 * excerpt (presumably false / true / false, then true) -- confirm.
 *
 * @param node node to test; must not be null
 */
1067 public static boolean expectsContent(Node node)
1069 if (node.type != Node.StartTag)
1072 /* unknown element? */
1073 if (node.tag == null)
1076 if ((node.tag.model & Dict.CM_EMPTY) != 0)
1083 create a text node for the contents of
1084 a CDATA element like style or script
1085 which ends with </foo> for some foo.
/**
 * Reads the raw text content of a CDATA element (e.g. script or style)
 * up to its matching end tag.
 *
 * Visible behaviour: characters are appended to lexbuf until end of
 * stream. A '/' directly after '<' marks a possible end tag and records
 * where its name starts. At the following '>' the name is compared,
 * ignoring case, with container.element: on a match txtend is set to just
 * before the "</"; otherwise a BAD_CDATA_CONTENT warning is issued and,
 * for JavaScript containers, a backslash is inserted before the '/' by
 * shifting the buffered bytes up by one. '\r' is looked at together with
 * the following character for line-ending normalisation. Reaching end of
 * stream produces a MISSING_ENDTAG_FOR warning. When any text was
 * collected a TextNode token is created.
 * NOTE(review): the loop header, several branch bodies and the return
 * statement are not visible in this excerpt.
 *
 * @param container the element whose content is being read
 */
1087 public Node getCDATA(Node container)
1089 int c, lastc, start, len, i;
1091 boolean endtag = false;
1093 this.lines = this.in.curline;
1094 this.columns = this.in.curcol;
1095 this.waswhite = false;
1096 this.txtstart = this.lexsize;
1097 this.txtend = this.lexsize;
1104 c = this.in.readChar();
1105 if (c == StreamIn.EndOfStream) break;
1106 /* treat \r\n as \n and \r as \n */
1108 if (c == (int)'/' && lastc == (int)'<')
1112 this.lines = this.in.curline;
1113 this.columns = this.in.curcol - 3;
1115 Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1118 start = this.lexsize + 1; /* to first letter */
1121 else if (c == (int)'>' && start >= 0)
1123 len = this.lexsize - start;
1124 if (len == container.element.length())
1126 str = getString( this.lexbuf, start, len );
1127 if (Lexer.wstrcasecmp(str, container.element) == 0)
/* start points at the first letter of the name, so start - 2 is the '<' of "</" */
1129 this.txtend = start - 2;
1134 this.lines = this.in.curline;
1135 this.columns = this.in.curcol - 3;
1137 Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1139 /* if javascript insert backslash before / */
1141 if (ParserImpl.isJavaScript(container))
/* shift bytes up by one from the end to open a slot at start-1 */
1143 for (i = this.lexsize; i > start-1; --i)
1144 this.lexbuf[i] = this.lexbuf[i-1];
1146 this.lexbuf[start-1] = (byte)'\\';
1152 else if (c == (int)'\r')
1154 c = this.in.readChar();
1157 this.in.ungetChar(c);
1162 addCharToLexer((int)c);
1163 this.txtend = this.lexsize;
1167 if (c == StreamIn.EndOfStream)
1168 Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
1170 if (this.txtend > this.txtstart)
1172 this.token = newNode(Node.TextNode,
1182 public void ungetToken()
/* Lexer modes, passed as the "mode" argument of getToken() and
   parseEntity(). parseEntity() tests Preformatted with a bit mask
   ((mode & Preformatted) != 0), which is also non-zero for IgnoreMarkup
   (3); getToken() compares the mode by equality. */
1187 public static final short IgnoreWhitespace = 0;
1188 public static final short MixedContent = 1;
1189 public static final short Preformatted = 2;
1190 public static final short IgnoreMarkup = 3;
1193 modes for GetToken()
1195 MixedContent -- for elements which don't accept PCDATA
1196 Preformatted -- white space preserved as is
1197 IgnoreMarkup -- for CDATA elements such as script, style
1200 public Node getToken(short mode)
1206 MutableBoolean isempty = new MutableBoolean();
1211 /* duplicate inlines in preference to pushed text nodes when appropriate */
1212 if (this.token.type != Node.TextNode ||
1213 (this.insert == -1 && this.inode == null))
1215 this.pushed = false;
1220 /* at start of block elements, unclosed inline
1221 elements are inserted into the token stream */
1223 if (this.insert != -1 || this.inode != null)
1224 return insertedToken();
1226 this.lines = this.in.curline;
1227 this.columns = this.in.curcol;
1228 this.waswhite = false;
1230 this.txtstart = this.lexsize;
1231 this.txtend = this.lexsize;
1235 c = this.in.readChar();
1236 if (c == StreamIn.EndOfStream) break;
1237 if (this.insertspace && mode != IgnoreWhitespace)
1239 addCharToLexer(' ');
1240 this.waswhite = true;
1241 this.insertspace = false;
1244 /* treat \r\n as \n and \r as \n */
1248 c = this.in.readChar();
1251 this.in.ungetChar(c);
1260 case LEX_CONTENT: /* element content */
1264 Discard white space if appropriate. Its cheaper
1265 to do this here rather than in parser methods
1266 for elements that don't have mixed content.
1268 if (((map & WHITE) != 0) && (mode == IgnoreWhitespace)
1269 && this.lexsize == this.txtstart + 1)
1272 this.waswhite = false;
1273 this.lines = this.in.curline;
1274 this.columns = this.in.curcol;
1280 this.state = LEX_GT;
1284 if ((map & WHITE) != 0)
1286 /* was previous char white? */
1289 if (mode != Preformatted && mode != IgnoreMarkup)
1292 this.lines = this.in.curline;
1293 this.columns = this.in.curcol;
1296 else /* prev char wasn't white */
1298 this.waswhite = true;
1301 if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
1302 changeChar((byte)' ');
1307 else if (c == '&' && mode != IgnoreMarkup)
1310 /* this is needed to avoid trimming trailing whitespace */
1311 if (mode == IgnoreWhitespace)
1312 mode = MixedContent;
1314 this.waswhite = false;
1317 case LEX_GT: /* < */
1319 /* check for endtag */
1322 c = this.in.readChar();
1323 if (c == StreamIn.EndOfStream)
1325 this.in.ungetChar(c);
1332 if ((map & LETTER) != 0)
1335 this.txtend = this.lexsize;
1336 this.in.ungetChar(c);
1337 this.state = LEX_ENDTAG;
1338 this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */
1339 this.in.curcol -= 2;
1341 /* if some text before the </ return it now */
1342 if (this.txtend > this.txtstart)
1344 /* trim space char before end tag */
1345 if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte)' ')
1348 this.txtend = this.lexsize;
1351 this.token = newNode(Node.TextNode,
1358 continue; /* no text so keep going */
1361 /* otherwise treat as CDATA */
1362 this.waswhite = false;
1363 this.state = LEX_CONTENT;
1367 if (mode == IgnoreMarkup)
1369 /* otherwise treat as CDATA */
1370 this.waswhite = false;
1371 this.state = LEX_CONTENT;
1376 look out for comments, doctype or marked sections
1377 this isn't quite right, but its getting there ...
1381 c = this.in.readChar();
1385 c = this.in.readChar();
1389 this.state = LEX_COMMENT; /* comment */
1391 this.txtend = this.lexsize;
1393 /* if some text before < return it now */
1394 if (this.txtend > this.txtstart)
1396 this.token = newNode(Node.TextNode,
1403 this.txtstart = this.lexsize;
1407 Report.warning(this, null, null, Report.MALFORMED_COMMENT);
1409 else if (c == 'd' || c == 'D')
1411 this.state = LEX_DOCTYPE; /* doctype */
1413 this.txtend = this.lexsize;
1414 mode = IgnoreWhitespace;
1416 /* skip until white space or '>' */
1420 c = this.in.readChar();
1422 if (c == StreamIn.EndOfStream || c == '>')
1424 this.in.ungetChar(c);
1430 if ((map & WHITE) == 0)
1433 /* and skip to end of whitespace */
1437 c = this.in.readChar();
1439 if (c == StreamIn.EndOfStream || c == '>')
1441 this.in.ungetChar(c);
1447 if ((map & WHITE) != 0)
1450 this.in.ungetChar(c);
1457 /* if some text before < return it now */
1458 if (this.txtend > this.txtstart)
1460 this.token = newNode(Node.TextNode,
1467 this.txtstart = this.lexsize;
1472 /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
1474 this.state = LEX_SECTION;
1475 this.txtend = this.lexsize;
1477 /* if some text before < return it now */
1478 if (this.txtend > this.txtstart)
1480 this.token = newNode(Node.TextNode,
1487 this.txtstart = this.lexsize;
1491 /* otherwise swallow chars up to and including next '>' */
1494 c = this.in.readChar();
1495 if (c == '>') break;
1498 this.in.ungetChar(c);
1504 this.lexbuf[this.lexsize] = (byte)'\0';
1505 this.state = LEX_CONTENT;
1510 processing instructions
1516 this.state = LEX_PROCINSTR;
1517 this.txtend = this.lexsize;
1519 /* if some text before < return it now */
1520 if (this.txtend > this.txtstart)
1522 this.token = newNode(Node.TextNode,
1529 this.txtstart = this.lexsize;
1533 /* Microsoft ASP's e.g. <% ... server-code ... %> */
1537 this.state = LEX_ASP;
1538 this.txtend = this.lexsize;
1540 /* if some text before < return it now */
1541 if (this.txtend > this.txtstart)
1543 this.token = newNode(Node.TextNode,
1550 this.txtstart = this.lexsize;
1554 /* Netscapes JSTE e.g. <# ... server-code ... #> */
1558 this.state = LEX_JSTE;
1559 this.txtend = this.lexsize;
1561 /* if some text before < return it now */
1562 if (this.txtend > this.txtstart)
1564 this.token = newNode(Node.TextNode,
1571 this.txtstart = this.lexsize;
1577 /* check for start tag */
1578 if ((map & LETTER) != 0)
1580 this.in.ungetChar(c); /* push back letter */
1581 this.lexsize -= 2; /* discard "<" + letter */
1582 this.txtend = this.lexsize;
1583 this.state = LEX_STARTTAG; /* ready to read tag name */
1585 /* if some text before < return it now */
1586 if (this.txtend > this.txtstart)
1588 this.token = newNode(Node.TextNode,
1595 continue; /* no text so keep going */
1598 /* otherwise treat as CDATA */
1599 this.state = LEX_CONTENT;
1600 this.waswhite = false;
1603 case LEX_ENDTAG: /* </letter */
1604 this.txtstart = this.lexsize - 1;
1605 this.in.curcol += 2;
1607 this.token = newNode(Node.EndTag, /* create endtag token */
1611 getString(this.lexbuf,
1613 this.txtend - this.txtstart));
1614 this.lexsize = this.txtstart;
1615 this.txtend = this.txtstart;
1620 c = this.in.readChar();
1622 if (c == StreamIn.EndOfStream)
1626 if (c == StreamIn.EndOfStream)
1628 this.in.ungetChar(c);
1632 this.state = LEX_CONTENT;
1633 this.waswhite = false;
1634 return this.token; /* the endtag token */
1636 case LEX_STARTTAG: /* first letter of tagname */
1637 this.txtstart = this.lexsize - 1; /* set txtstart to first letter */
1639 isempty.value = false;
1641 this.token = newNode((isempty.value ? Node.StartEndTag : Node.StartTag),
1645 getString(this.lexbuf,
1647 this.txtend - this.txtstart));
1649 /* parse attributes, consuming closing ">" */
1653 this.in.ungetChar(c);
1655 attributes = parseAttrs(isempty);
1659 this.token.type = Node.StartEndTag;
1661 this.token.attributes = attributes;
1662 this.lexsize = this.txtstart;
1663 this.txtend = this.txtstart;
1665 /* swallow newline following start tag */
1666 /* special check needed for CRLF sequence */
1667 /* this doesn't apply to empty elements */
1669 if (expectsContent(this.token) ||
1670 this.token.tag == configuration.tt.tagBr)
1673 c = this.in.readChar();
1677 c = this.in.readChar();
1680 this.in.ungetChar(c);
1682 else if (c != '\n' && c != '\f')
1683 this.in.ungetChar(c);
1685 this.waswhite = true; /* to swallow leading whitespace */
1688 this.waswhite = false;
1690 this.state = LEX_CONTENT;
1692 if (this.token.tag == null)
1693 Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
1694 else if (!this.configuration.XmlTags)
1696 this.versions &= this.token.tag.versions;
1698 if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0)
1700 if (!this.configuration.MakeClean && (this.token.tag == configuration.tt.tagNobr ||
1701 this.token.tag == configuration.tt.tagWbr))
1702 Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
1705 if (this.token.tag.chkattrs != null)
1707 this.token.checkUniqueAttributes(this);
1708 this.token.tag.chkattrs.check(this, this.token);
1711 this.token.checkAttributes(this);
1714 return this.token; /* return start tag */
1716 case LEX_COMMENT: /* seen <!-- so look for --> */
1721 c = this.in.readChar();
1727 end_comment: while (true) {
1728 c = this.in.readChar();
1732 if (badcomment != 0)
1733 Report.warning(this, null, null, Report.MALFORMED_COMMENT);
1735 this.txtend = this.lexsize - 2; // AQ 8Jul2000
1736 this.lexbuf[this.lexsize] = (byte)'\0';
1737 this.state = LEX_CONTENT;
1738 this.waswhite = false;
1739 this.token = newNode(Node.CommentTag,
1744 /* now look for a line break */
1746 c = this.in.readChar();
1750 c = this.in.readChar();
1753 this.token.linebreak = true;
1757 this.token.linebreak = true;
1759 this.in.ungetChar(c);
1764 /* note position of first such error in the comment */
1765 if (badcomment == 0)
1767 this.lines = this.in.curline;
1768 this.columns = this.in.curcol - 3;
1772 if (this.configuration.FixComments)
1773 this.lexbuf[this.lexsize - 2] = (byte)'=';
1777 /* if '-' then look for '>' to end the comment */
1782 /* otherwise continue to look for --> */
1783 this.lexbuf[this.lexsize - 2] = (byte)'=';
1786 case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */
1789 if ((map & WHITE) != 0)
1794 this.waswhite = true;
1797 this.waswhite = false;
1803 this.txtend = this.lexsize;
1804 this.lexbuf[this.lexsize] = (byte)'\0';
1805 this.state = LEX_CONTENT;
1806 this.waswhite = false;
1807 this.token = newNode(Node.DocTypeTag,
1811 /* make a note of the version named by the doctype */
1812 this.doctype = findGivenVersion(this.token);
1815 case LEX_PROCINSTR: /* seen <? so look for '>' */
1816 /* check for PHP preprocessor instructions <?php ... ?> */
1818 if (this.lexsize - this.txtstart == 3)
1820 if ((getString(this.lexbuf, this.txtstart, 3)).equals("php"))
1822 this.state = LEX_PHP;
1827 if (this.configuration.XmlPIs) /* insist on ?> as terminator */
1832 /* now look for '>' */
1833 c = this.in.readChar();
1835 if (c == StreamIn.EndOfStream)
1837 Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
1838 this.in.ungetChar(c);
1849 this.txtend = this.lexsize;
1850 this.lexbuf[this.lexsize] = (byte)'\0';
1851 this.state = LEX_CONTENT;
1852 this.waswhite = false;
1853 this.token = newNode(Node.ProcInsTag,
1859 case LEX_ASP: /* seen <% so look for "%>" */
1863 /* now look for '>' */
1864 c = this.in.readChar();
1869 this.in.ungetChar(c);
1874 this.txtend = this.lexsize;
1875 this.lexbuf[this.lexsize] = (byte)'\0';
1876 this.state = LEX_CONTENT;
1877 this.waswhite = false;
1878 this.token = newNode(Node.AspTag,
1884 case LEX_JSTE: /* seen <# so look for "#>" */
1888 /* now look for '>' */
1889 c = this.in.readChar();
1894 this.in.ungetChar(c);
1899 this.txtend = this.lexsize;
1900 this.lexbuf[this.lexsize] = (byte)'\0';
1901 this.state = LEX_CONTENT;
1902 this.waswhite = false;
1903 this.token = newNode(Node.JsteTag,
1909 case LEX_PHP: /* seen "<?php" so look for "?>" */
1913 /* now look for '>' */
1914 c = this.in.readChar();
1918 this.in.ungetChar(c);
1923 this.txtend = this.lexsize;
1924 this.lexbuf[this.lexsize] = (byte)'\0';
1925 this.state = LEX_CONTENT;
1926 this.waswhite = false;
1927 this.token = newNode(Node.PhpTag,
1933 case LEX_SECTION: /* seen "<![" so look for "]>" */
1936 if (this.lexsize == (this.txtstart + 6) &&
1937 (getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
1939 this.state = LEX_CDATA;
1948 /* now look for '>' */
1949 c = this.in.readChar();
1953 this.in.ungetChar(c);
1958 this.txtend = this.lexsize;
1959 this.lexbuf[this.lexsize] = (byte)'\0';
1960 this.state = LEX_CONTENT;
1961 this.waswhite = false;
1962 this.token = newNode(Node.SectionTag,
1968 case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
1972 /* now look for ']' */
1973 c = this.in.readChar();
1977 this.in.ungetChar(c);
1981 /* now look for '>' */
1982 c = this.in.readChar();
1986 this.in.ungetChar(c);
1991 this.txtend = this.lexsize;
1992 this.lexbuf[this.lexsize] = (byte)'\0';
1993 this.state = LEX_CONTENT;
1994 this.waswhite = false;
1995 this.token = newNode(Node.CDATATag,
2003 if (this.state == LEX_CONTENT) /* text string */
2005 this.txtend = this.lexsize;
2007 if (this.txtend > this.txtstart)
2009 this.in.ungetChar(c);
2011 if (this.lexbuf[this.lexsize - 1] == (byte)' ')
2014 this.txtend = this.lexsize;
2017 this.token = newNode(Node.TextNode,
2024 else if (this.state == LEX_COMMENT) /* comment */
2026 if (c == StreamIn.EndOfStream)
2027 Report.warning(this, null, null, Report.MALFORMED_COMMENT);
2029 this.txtend = this.lexsize;
2030 this.lexbuf[this.lexsize] = (byte)'\0';
2031 this.state = LEX_CONTENT;
2032 this.waswhite = false;
2033 this.token = newNode(Node.CommentTag,
2044 parser for ASP within start tags
2046 Some people use ASP to customize attributes.
2047 Tidy isn't really well suited to dealing with ASP
2048 This is a workaround for attributes, but won't
2049 deal with the case where the ASP is used to tailor
2050 the attribute value. Here is an example of a work
2051 around for using ASP in attribute values:
2053 href="<%=rsSchool.Fields("ID").Value%>"
2055 where the ASP that generates the attribute value
2056 is masked from Tidy by the quotemarks.
/*
 * Collects ASP markup that appears inside a start tag.
 * The span opens at the current end of the lexer buffer (txtstart = lexsize)
 * and characters are pulled from the input with readChar(). Once the span is
 * closed (txtend = lexsize), a non-empty span is wrapped in a Node.AspTag
 * node through newNode(), and txtstart is then moved up to txtend.
 * NOTE(review): presumably the result is that node, or null for an empty
 * span - confirm.
 */
2060 public Node parseAsp()
2065 this.txtstart = this.lexsize;
2069 c = this.in.readChar();
2076 c = this.in.readChar();
2084 this.txtend = this.lexsize;
2086 if (this.txtend > this.txtstart)
2087 asp = newNode(Node.AspTag,
2092 this.txtstart = this.txtend;
2097 PHP is like ASP but is based upon XML
2098 processing instructions, e.g. <?php ... ?>
/*
 * Collects PHP markup that appears inside a start tag.
 * Mirrors parseAsp: the span opens at the current end of the lexer buffer
 * (txtstart = lexsize), characters are pulled from the input with
 * readChar(), and once the span is closed (txtend = lexsize) a non-empty
 * span is wrapped in a Node.PhpTag node through newNode(). txtstart is then
 * moved up to txtend.
 * NOTE(review): presumably the result is that node, or null for an empty
 * span - confirm.
 */
2100 public Node parsePhp()
2105 this.txtstart = this.lexsize;
2109 c = this.in.readChar();
2116 c = this.in.readChar();
2124 this.txtend = this.lexsize;
2126 if (this.txtend > this.txtstart)
2127 php = newNode(Node.PhpTag,
2132 this.txtstart = this.txtend;
/*
 * Reads the next attribute name of the start tag being lexed.
 *
 * isempty - its value is set to true at one point of the leading scan
 *           (NOTE(review): presumably when a slash is followed by the
 *           closing angle bracket - confirm)
 * asp     - cleared on entry; later receives the result of parseAsp()
 * php     - cleared on entry; later receives the result of parsePhp()
 *
 * Report.attrError is called for an unexpected closing angle bracket, an
 * unexpected quote mark and an unexpected end of file.
 * While the name is collected, an equals sign or closing angle bracket is
 * pushed back onto the input, as is an opening angle bracket or the end of
 * stream marker. Unless configuration.XmlTags is set, upper case characters
 * are shifted to lower case.
 * The name is the buffer text between the saved start offset and lexsize;
 * attr is null when that span is empty, and lexsize is rewound to the start
 * offset afterwards.
 */
2136 /* consumes the '>' terminating start tags */
2137 public String parseAttribute(MutableBoolean isempty, MutableObject asp,
2141 // int len = 0; Removed by BUGFIX for 126265
2146 asp.setObject(null); /* clear asp pointer */
2147 php.setObject(null); /* clear php pointer */
2148 /* skip white space before the attribute */
2152 c = this.in.readChar();
2156 c = this.in.readChar();
2160 isempty.value = true;
2164 this.in.ungetChar(c);
2174 c = this.in.readChar();
2178 asp.setObject(parseAsp());
2183 php.setObject(parsePhp());
2187 this.in.ungetChar(c);
2188 Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
2192 if (c == '"' || c == '\'')
2194 Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
2198 if (c == StreamIn.EndOfStream)
2200 Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
2201 this.in.ungetChar(c);
2207 if ((map & WHITE) == 0)
2211 start = this.lexsize;
2215 /* but push back '=' for parseValue() */
2216 if (c == '=' || c == '>')
2218 this.in.ungetChar(c);
2222 if (c == '<' || c == StreamIn.EndOfStream)
2224 this.in.ungetChar(c);
2230 if ((map & WHITE) != 0)
2233 /* what should be done about non-namechar characters? */
2234 /* currently these are incorporated into the attr name */
2236 if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
2237 c += (int)('a' - 'A');
2239 // ++len; Removed by BUGFIX for 126265
2242 c = this.in.readChar();
2245 // Following line added by GLP to fix BUG 126265. This is a temporary comment
2246 // and should be removed when Tidy is fixed.
2247 int len = this.lexsize - start;
2248 attr = (len > 0 ? getString(this.lexbuf, start, len) : null);
2249 this.lexsize = start;
2255 invoked when < is seen in place of attribute value
2256 but terminates on whitespace if not ASP, PHP or Tango
2257 this routine recognizes ' and " quoted strings
/*
 * Scans a server-side instruction found where an attribute value was
 * expected.
 * The delimiter starts out as the double quote character. A character read
 * from the input is tested for percent, question mark or at-sign, which
 * mark ASP, PHP or Tango; when none of these is recognised the value also
 * ends on white space.
 * NOTE(review): the int result is presumably the delimiter - confirm;
 * parseValue stores it in pdelim.value.
 */
2259 public int parseServerInstruction()
2261 int c, map, delim = '"';
2262 boolean isrule = false;
2264 c = this.in.readChar();
2267 /* check for ASP, PHP or Tango */
2268 if (c == '%' || c == '?' || c == '@')
2273 c = this.in.readChar();
2275 if (c == StreamIn.EndOfStream)
2283 this.in.ungetChar(c);
2288 /* if not recognized as ASP, PHP or Tango */
2289 /* then also finish value on whitespace */
2294 if ((map & WHITE) != 0)
2304 c = this.in.readChar();
2316 c = this.in.readChar();
/*
 * Reads the value that follows an attribute name.
 *
 * name     - attribute name; used for the isUrl and isScript lookups
 * foldCase - when true, upper case characters of the value are shifted to
 *            lower case
 * isempty  - its value is set to true in the peek-ahead branch of an
 *            unquoted value, which is guarded by a negated isUrl(name) test
 * pdelim   - receives the delimiter; preset to the double quote character
 *
 * Outline of the steps:
 *  - white space before and after the equals sign is skipped; end of stream
 *    in either scan pushes the character back
 *  - a server instruction is handed to parseServerInstruction(), whose
 *    result is stored in pdelim.value; the text it added to the buffer is
 *    then handed back, or null when it added none
 *  - otherwise the value is read character by character, reporting an
 *    unexpected end of file, quote mark or closing angle bracket through
 *    Report.attrError, and calling parseEntity((short)0) along the way
 *  - when quotewarning exceeds 10 and both seen_gt and munge are set,
 *    SUSPECTED_MISSING_QUOTE is reported unless the attribute is a script
 *    attribute or a URL whose first 11 buffer characters spell the
 *    javascript scheme
 *  - the value is the buffer text from start with length len, taken when
 *    len is positive or a delimiter was seen; lexsize is rewound to start
 * NOTE(review): configuration.LiteralAttribs presumably switches munge
 * off - confirm.
 */
2326 /* values start with "=" or " = " etc. */
2327 /* doesn't consume the ">" at end of start tag */
2329 public String parseValue(String name, boolean foldCase,
2330 MutableBoolean isempty, MutableInteger pdelim)
2335 boolean seen_gt = false;
2336 boolean munge = true;
2338 int lastc, delim, quotewarning;
2342 pdelim.value = (int)'"';
2345 Henry Zrepa reports that some folk are using the
2346 embed element with script attributes where newlines
2347 are significant and must be preserved
2349 if (configuration.LiteralAttribs)
2352 /* skip white space before the '=' */
2356 c = this.in.readChar();
2358 if (c == StreamIn.EndOfStream)
2360 this.in.ungetChar(c);
2366 if ((map & WHITE) == 0)
2371 c should be '=' if there is a value
2372 other legal possibilities are white
2378 this.in.ungetChar(c);
2382 /* skip white space after '=' */
2386 c = this.in.readChar();
2388 if (c == StreamIn.EndOfStream)
2390 this.in.ungetChar(c);
2396 if ((map & WHITE) == 0)
2400 /* check for quote marks */
2402 if (c == '"' || c == '\'')
2406 start = this.lexsize;
2408 pdelim.value = parseServerInstruction();
2409 len = this.lexsize - start;
2410 this.lexsize = start;
2411 return (len > 0 ? getString(this.lexbuf, start, len) : null);
2414 this.in.ungetChar(c);
2417 and read the value string
2418 check for quote mark if needed
2422 start = this.lexsize;
2427 lastc = c; /* track last character */
2428 c = this.in.readChar();
2430 if (c == StreamIn.EndOfStream)
2432 Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
2433 this.in.ungetChar(c);
2437 if (delim == (char)0)
2441 this.in.ungetChar(c);
2445 if (c == '"' || c == '\'')
2447 Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
2453 /* this.in.ungetChar(c); */
2454 Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
2459 For cases like <br clear=all/> need to avoid treating /> as
2460 part of the attribute value, however care is needed to avoid
2461 so treating <a href=http://www.acme.com/> in this way, which
2462 would map the <a> tag to <a href="http://www.acme.com"/>
2466 /* peek ahead in case of /> */
2467 c = this.in.readChar();
2470 !AttributeTable.getDefaultAttributeTable().isUrl(name))
2472 isempty.value = true;
2473 this.in.ungetChar(c);
2477 /* unget peeked char */
2478 this.in.ungetChar(c);
2482 else /* delim is '\'' or '"' */
2487 /* treat CRLF, CR and LF as single line break */
2491 c = this.in.readChar();
2493 this.in.ungetChar(c);
2498 if (c == '\n' || c == '<' || c == '>')
2508 parseEntity((short)0);
2513 kludge for JavaScript attribute values
2514 with line continuations in string literals
2518 c = this.in.readChar();
2522 this.in.ungetChar(c);
2529 if ((map & WHITE) != 0)
2531 if (delim == (char)0)
2542 else if (foldCase && (map & UPPERCASE) != 0)
2543 c += (int)('a' - 'A');
2548 if (quotewarning > 10 && seen_gt && munge)
2551 there is almost certainly a missing trailling quote mark
2552 as we have see too many newlines, < or > characters.
2554 an exception is made for Javascript attributes and the
2555 javascript URL scheme which may legitimately include < and >
2557 if (!AttributeTable.getDefaultAttributeTable().isScript(name) &&
2558 !(AttributeTable.getDefaultAttributeTable().isUrl(name) &&
2559 (getString(this.lexbuf, start, 11)).equals("javascript:")))
2560 Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
2563 len = this.lexsize - start;
2564 this.lexsize = start;
2566 if (len > 0 || delim != 0)
2567 value = getString(this.lexbuf, start, len);
2571 /* note delimiter if given */
2573 pdelim.value = delim;
2575 pdelim.value = (int)'"';
2580 /* attr must be non-null */
2581 public static boolean isValidAttrName(String attr)
2587 /* first character should be a letter */
2591 if (!((map & LETTER) != 0))
2594 /* remaining characters should be namechars */
2595 for( i = 1; i < attr.length(); i++)
2600 if((map & NAMECHAR) != 0)
/*
 * Parses the attributes of a start tag.
 * Loops until endOfInput(), calling parseAttribute() for each name. When no
 * name comes back, the asp and php holders are checked and a node found in
 * either one is wrapped in an AttVal. Otherwise parseValue() is called with
 * foldCase set to false. A name accepted by isValidAttrName() yields an
 * AttVal built from list, the delimiter, the name and the value, which is
 * then passed to AttributeTable.findAttribute(). Any other name yields an
 * AttVal built with null in place of list, followed by a
 * BAD_ATTRIBUTE_VALUE report.
 * NOTE(review): the result is presumably the head of the AttVal list -
 * confirm.
 */
2609 /* swallows closing '>' */
2611 public AttVal parseAttrs(MutableBoolean isempty)
2614 String attribute, value;
2615 MutableInteger delim = new MutableInteger();
2616 MutableObject asp = new MutableObject();
2617 MutableObject php = new MutableObject();
2621 for (; !endOfInput();)
2623 attribute = parseAttribute(isempty, asp, php);
2625 if (attribute == null)
2627 /* check if attributes are created by ASP markup */
2628 if (asp.getObject() != null)
2630 av = new AttVal(list, null, (Node)asp.getObject(), null,
2636 /* check if attributes are created by PHP markup */
2637 if (php.getObject() != null)
2639 av = new AttVal(list, null, null, (Node)php.getObject(),
2648 value = parseValue(attribute, false, isempty, delim);
2650 if (attribute != null && isValidAttrName(attribute))
2652 av = new AttVal( list, null, null, null,
2653 delim.value, attribute, value );
2655 AttributeTable.getDefaultAttributeTable().findAttribute(av);
2660 av = new AttVal( null, null, null, null,
2661 0, attribute, value );
2662 Report.attrError(this, this.token, value, Report.BAD_ATTRIBUTE_VALUE);
2670 push a copy of an inline node onto stack
2671 but don't push if implicit or OBJECT or APPLET
2672 (implicit tags are ones generated from the istack)
2674 One issue arises with pushing inlines when
2675 the tag is already pushed. For instance:
2680 Shouldn't be mapped to
2682 <p><em>text</em></p>
2683 <p><em><em>more text</em></em>
/*
 * Pushes a copy of an inline element onto the inline stack.
 * Guards are tested in this order: a null tag, a tag whose model lacks
 * Dict.CM_INLINE, a tag whose model has Dict.CM_OBJECT, and a tag other
 * than the font tag that isPushed() already finds on the stack.
 * NOTE(review): each guard presumably leaves without pushing - confirm.
 * The pushed IStack entry takes the element field of the node and, when the
 * node has attributes, a clone of them made with cloneAttributes().
 */
2685 public void pushInline( Node node )
2692 if (node.tag == null)
2695 if ((node.tag.model & Dict.CM_INLINE) == 0 )
2698 if ((node.tag.model & Dict.CM_OBJECT) != 0)
2701 if (node.tag != configuration.tt.tagFont && isPushed(node))
2704 // make sure there is enough space for the stack
2707 is.element = node.element;
2708 if (node.attributes != null)
2709 is.attributes = cloneAttributes(node.attributes);
2710 this.istack.push( is );
/*
 * Pops entries from the inline stack for the given node.
 * Three tag guards are tested first: a null tag, a model without
 * Dict.CM_INLINE, and a model with Dict.CM_OBJECT. For the anchor tag,
 * entries are popped while the stack is not empty, testing each popped
 * entry against the anchor tag. For any other tag a single entry is popped
 * when the stack is not empty. After popping, this.insert is compared with
 * the new stack size.
 * NOTE(review): presumably insert is reset when it is no longer a valid
 * index - confirm.
 */
2713 /* pop inline stack */
2714 public void popInline( Node node )
2721 if (node.tag == null)
2724 if ((node.tag.model & Dict.CM_INLINE) == 0)
2727 if ((node.tag.model & Dict.CM_OBJECT) != 0)
2730 // if node is </a> then pop until we find an <a>
2731 if (node.tag == configuration.tt.tagA) {
2733 while (this.istack.size() > 0) {
2734 is = (IStack)this.istack.pop();
2735 if (is.tag == configuration.tt.tagA) {
2740 if (this.insert >= this.istack.size())
2746 if (this.istack.size() > 0) {
2747 is = (IStack)this.istack.pop();
2748 if (this.insert >= this.istack.size())
/*
 * Walks the inline stack from the top entry down to index 0 and compares
 * the tag of each entry with node.tag by reference.
 * NOTE(review): presumably answers true on the first match and false when
 * none is found - confirm.
 */
2753 public boolean isPushed( Node node )
2758 for (i = this.istack.size() - 1; i >= 0; --i) {
2759 is = (IStack)this.istack.elementAt(i);
2760 if (is.tag == node.tag)
2768 This has the effect of inserting "missing" inline
2769 elements around the contents of blocklevel elements
2770 such as P, TD, TH, DIV, PRE etc. This procedure is
2771 called at the start of ParseBlock, when the inline
2772 stack is not empty, as will be the case in:
2774 <i><h1>italic heading</h1></i>
2776 which is then treated as equivalent to
2778 <h1><i>italic heading</i></h1>
2780 This is implemented by setting the lexer into a mode
2781 where it gets tokens from the inline stack rather than
2782 from the input stream.
/*
 * Computes n, the number of inline stack entries above istackbase, and
 * points this.insert at istackbase.
 * NOTE(review): presumably the assignment only happens when n is positive,
 * and n is the result - confirm.
 */
2784 public int inlineDup( Node node )
2788 n = this.istack.size() - this.istackbase;
2790 this.insert = this.istackbase;
/*
 * Builds the next implicit start tag from the inline stack.
 * The case this.insert == -1 is tested first. When this.inode is null the
 * position (lines, columns) is refreshed from the input stream. A
 * Node.StartTag node is then created, flagged implicit, and given the
 * element field of the stack entry at index this.insert, plus a clone of
 * the attributes of that entry when it has any.
 * NOTE(review): presumably insert then moves to the next stack entry and is
 * reset once the end of the stack is reached - confirm.
 */
2797 public Node insertedToken()
2803 // this will only be null if inode != null
2804 if (this.insert == -1) {
2810 // is this is the "latest" node then update
2811 // the position, otherwise use current values
2813 if (this.inode == null) {
2814 this.lines = this.in.curline;
2815 this.columns = this.in.curcol;
2818 node = newNode(Node.StartTag,
2821 this.txtend); // GLP: Bugfix 126261. Remove when this change
2822 // is fixed in istack.c in the original Tidy
2823 node.implicit = true;
2824 is = (IStack)this.istack.elementAt( this.insert );
2825 node.element = is.element;
2827 if (is.attributes != null)
2828 node.attributes = cloneAttributes(is.attributes);
2830 // advance lexer to next item on the stack
2833 // and recover state if we have reached the end
2834 if (++n < this.istack.size() ) {
/*
 * Case-insensitive equality test with a strcmp-like result: 0 when s1
 * equals s2 ignoring case, 1 otherwise. The result carries no ordering
 * information, so it cannot be used for sorting.
 */
2843 /* AQ: Try this for speed optimization */
2844 public static int wstrcasecmp(String s1, String s2)
2846 return (s1.equalsIgnoreCase(s2) ? 0 : 1);
/*
 * Compares s1 and s2 ignoring case.
 * The loop runs while both strings still have a character at index i and
 * tests whether the lower-cased characters differ. The branches that follow
 * distinguish four cases: both strings exhausted, only s1 exhausted, only
 * s2 exhausted, and a mismatch. In the mismatch case the raw characters,
 * not the lower-cased ones, decide between 1 and -1.
 * NOTE(review): the first three cases presumably yield 0, -1 and 1 in that
 * order - confirm.
 */
2849 public static int wstrcaselexcmp(String s1, String s2)
2854 while ( i < s1.length() && i < s2.length() ) {
2856 if ( toLower(c) != toLower( s2.charAt(i) ) ) {
2861 if ( i == s1.length() && i == s2.length() ) {
2863 } else if ( i == s1.length() ) {
2865 } else if ( i == s2.length() ) {
2868 return ( s1.charAt(i) > s2.charAt(i) ? 1 : -1 );
2872 public static boolean wsubstr(String s1, String s2)
2875 int len1 = s1.length();
2876 int len2 = s2.length();
2878 for (i = 0; i <= len1 - len2; ++i)
2880 if (s2.equalsIgnoreCase(s1.substring(i)))
/*
 * Decides whether an element may be pruned.
 * Tested in this order: a text node; an element that has content; an
 * anchor that has attributes; a paragraph while
 * configuration.DropEmptyParas is off; an element without a tag; a tag
 * whose model has Dict.CM_ROW; the applet tag; the object tag; and an
 * element that has attributes including an id or a name.
 * NOTE(review): presumably the text node test and the final fall-through
 * answer true and every other test answers false - confirm.
 */
2887 public boolean canPrune(Node element)
2889 if (element.type == Node.TextNode)
2892 if (element.content != null)
2895 if (element.tag == configuration.tt.tagA && element.attributes != null)
2898 if (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas)
2901 if (element.tag == null)
2904 if ((element.tag.model & Dict.CM_ROW) != 0)
2907 if (element.tag == configuration.tt.tagApplet)
2910 if (element.tag == configuration.tt.tagObject)
2913 if (element.attributes != null &&
2914 (element.getAttrByName("id") != null ||
2915 element.getAttrByName("name") != null) )
2921 /* duplicate name attribute as an id */
2922 public void fixId(Node node)
2924 AttVal name = node.getAttrByName("name");
2925 AttVal id = node.getAttrByName("id");
2931 if (!id.value.equals(name.value))
2932 Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH);
2934 else if (this.configuration.XmlOut)
2935 node.addAttribute("id", name.value);
2940 defer duplicates when entering a table or other
2941 element where the inlines shouldn't be duplicated
2943 public void deferDup()
2949 /* Private methods and fields */
2951 /* lexer char types */
2952 private static final short DIGIT = 1;
2953 private static final short LETTER = 2;
2954 private static final short NAMECHAR = 4;
2955 private static final short WHITE = 8;
2956 private static final short NEWLINE = 16;
2957 private static final short LOWERCASE = 32;
2958 private static final short UPPERCASE = 64;
2960 /* lexer GetToken states */
2962 private static final short LEX_CONTENT = 0;
2963 private static final short LEX_GT = 1;
2964 private static final short LEX_ENDTAG = 2;
2965 private static final short LEX_STARTTAG = 3;
2966 private static final short LEX_COMMENT = 4;
2967 private static final short LEX_DOCTYPE = 5;
2968 private static final short LEX_PROCINSTR = 6;
2969 private static final short LEX_ENDCOMMENT = 7;
2970 private static final short LEX_CDATA = 8;
2971 private static final short LEX_SECTION = 9;
2972 private static final short LEX_ASP = 10;
2973 private static final short LEX_JSTE = 11;
2974 private static final short LEX_PHP = 12;
2976 /* used to classify chars for lexical purposes */
2977 private static short[] lexmap = new short[128];
/*
 * Walks the characters of str and takes the code of each one as an index j.
 * NOTE(review): presumably merges code into lexmap[j] so that one character
 * can carry several class bits - confirm.
 */
2979 private static void mapStr(String str, short code)
2983 for ( int i = 0; i < str.length(); i++ ) {
2984 j = (int)str.charAt(i);
2990 mapStr("\r\n\f", (short)(NEWLINE|WHITE));
2991 mapStr(" \t", WHITE);
2992 mapStr("-.:_", NAMECHAR);
2993 mapStr("0123456789", (short)(DIGIT|NAMECHAR));
2994 mapStr("abcdefghijklmnopqrstuvwxyz", (short)(LOWERCASE|LETTER|NAMECHAR));
2995 mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short)(UPPERCASE|LETTER|NAMECHAR));
/*
 * Looks up the lexical class bits of c: the lexmap entry for character
 * codes below 128, and 0 (no class) for every other character.
 */
2998 private static short MAP( char c )
3000 return ((int)c < 128 ? lexmap[(int)c] : 0);
/*
 * Answers whether the WHITE bit is set in m.
 * NOTE(review): m is presumably the MAP() entry of c - confirm.
 */
3003 private static boolean isWhite(char c)
3007 return (m & WHITE) != 0;
/*
 * Answers whether the DIGIT bit is set in m.
 * NOTE(review): m is presumably the MAP() entry of c - confirm.
 */
3010 private static boolean isDigit(char c)
3016 return (m & DIGIT) != 0;
/*
 * Answers whether the LETTER bit is set in m.
 * NOTE(review): m is presumably the MAP() entry of c - confirm.
 */
3019 private static boolean isLetter(char c)
3025 return (m & LETTER) != 0;
/*
 * Shifts c to its lower case counterpart when the UPPERCASE bit is set in
 * m; the shift is the distance between the lower and upper case letter a.
 * NOTE(review): m is presumably the MAP() entry of c, and c is presumably
 * handed back unchanged otherwise - confirm.
 */
3028 private static char toLower(char c)
3032 if ((m & UPPERCASE) != 0)
3033 c = (char)( (int)c + (int)'a' - (int)'A' );
/*
 * Shifts c to its upper case counterpart when the LOWERCASE bit is set in
 * m; the shift is the distance between the upper and lower case letter a.
 * NOTE(review): m is presumably the MAP() entry of c, and c is presumably
 * handed back unchanged otherwise - confirm.
 */
3038 private static char toUpper(char c)
3042 if ((m & LOWERCASE) != 0)
3043 c = (char)( (int)c + (int)'A' - (int)'a' );
/*
 * Folds the case of c.
 * One branch shifts a character with the LOWERCASE bit to upper case; the
 * else branch, marked as forcing lower case, shifts a character with the
 * UPPERCASE bit to lower case.
 * NOTE(review): presumably tocaps selects between the two branches and
 * xmlTags switches folding off altogether - confirm.
 */
3048 public static char foldCase(char c, boolean tocaps, boolean xmlTags)
3058 if ((m & LOWERCASE) != 0)
3059 c = (char)( (int)c + (int)'A' - (int)'a' );
3061 else /* force to lower case */
3063 if ((m & UPPERCASE) != 0)
3064 c = (char)( (int)c + (int)'a' - (int)'A' );
/*
 * Record describing one W3C HTML version entry of the W3CVersion table.
 * The constructor stores its arguments in fields of the same name; the
 * voyagerName and profile assignments are the ones that appear below.
 * NOTE(review): voyagerName is presumably the matching XHTML 1.0 name and
 * profile the XHTML DTD URI - confirm against the table entries.
 */
3072 private static class W3CVersionInfo
3079 public W3CVersionInfo( String name,
3085 this.voyagerName = voyagerName;
3086 this.profile = profile;
3091 /* the 3 URIs for the XHTML 1.0 DTDs */
3092 private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
3093 private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
3094 private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
3096 private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
3098 private static Lexer.W3CVersionInfo[] W3CVersion =
3100 new W3CVersionInfo("HTML 4.01",
3103 Dict.VERS_HTML40_STRICT),
3104 new W3CVersionInfo("HTML 4.01 Transitional",
3105 "XHTML 1.0 Transitional",
3107 Dict.VERS_HTML40_LOOSE),
3108 new W3CVersionInfo("HTML 4.01 Frameset",
3109 "XHTML 1.0 Frameset",
3112 new W3CVersionInfo("HTML 4.0",
3115 Dict.VERS_HTML40_STRICT),
3116 new W3CVersionInfo("HTML 4.0 Transitional",
3117 "XHTML 1.0 Transitional",
3119 Dict.VERS_HTML40_LOOSE),
3120 new W3CVersionInfo("HTML 4.0 Frameset",
3121 "XHTML 1.0 Frameset",
3124 new W3CVersionInfo("HTML 3.2",
3125 "XHTML 1.0 Transitional",
3128 new W3CVersionInfo("HTML 2.0",