net.sourceforge.phpeclipse/src/org/w3c/tidy/Lexer.java

   1 /*
   2  * @(#)Lexer.java   1.11 2000/08/16
   3  *
   4  */
   5
   6 package org.w3c.tidy;
   7
   8 /**
   9  *
  10  * Lexer for html parser
  11  *
  12  * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
  13  * See Tidy.java for the copyright notice.
  14  * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
  15  * HTML Tidy Release 4 Aug 2000</a>
  16  *
  17  * @author  Dave Raggett <dsr@w3.org>
  18  * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
  19  * @version 1.0, 1999/05/22
  20  * @version 1.0.1, 1999/05/29
  21  * @version 1.1, 1999/06/18 Java Bean
  22  * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
  23  * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
  24  * @version 1.4, 1999/09/04 DOM support
  25  * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
  26  * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
  27  * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
  28  * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
  29  * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
  30  * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
  31  * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
  32  */
  33
  34 /*
  35   Given a file stream fp it returns a sequence of tokens.
  36
  37      GetToken(fp) gets the next token
  38      UngetToken(fp) provides one level undo
  39
  40   The tags include an attribute list:
  41
  42     - linked list of attribute/value nodes
  43     - each node has 2 null-terminated strings.
  44     - entities are replaced in attribute values
  45
  46   white space is compacted if not in preformatted mode
  47   If not in preformatted mode then leading white space
  48   is discarded and subsequent white space sequences
  49   compacted to single space chars.
  50
  51   If XmlTags is no then tag names are folded to lower
  52   case, as are attribute names.
  53
  54  Not yet done:
  55     -   Doctype subset and marked sections
  56 */
  57
  58 import java.io.PrintWriter;
  59 import java.util.Stack;
  60 import java.util.Vector;
  61
  62 import org.eclipse.core.resources.IFile;
  63 import sun.security.krb5.internal.av;
  64
  65 public class Lexer {
  66
  67     private IFile iFile;
  68     public StreamIn in;   /* file stream */
  69     public PrintWriter errout;   /* error output stream */
  70     public short badAccess; /* for accessibility errors */
  71     public short badLayout; /* for bad style errors */
  72     public short badChars;  /* for bad char encodings */
  73     public short badForm;   /* for mismatched/mispositioned form tags */
  74     public short warnings;  /* count of warnings in this document */
  75     public short errors;    /* count of errors */
  76     public int   lines;     /* lines seen */
  77     public int   columns;   /* at start of current token */
  78     public boolean waswhite;  /* used to collapse contiguous white space */
  79     public boolean pushed;    /* true after token has been pushed back */
  80     public boolean insertspace;   /* when space is moved after end tag */
  81     public boolean excludeBlocks;  /* Netscape compatibility */
  82     public boolean exiled;    /* true if moved out of table */
  83     public boolean isvoyager; /* true if xmlns attribute on html element */
  84     public short versions;  /* bit vector of HTML versions */
  85     public int doctype;    /* version as given by doctype (if any) */
  86     public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
  87     public int txtstart;  /* start of current node */
  88     public int txtend;    /* end of current node */
  89     public short state;     /* state of lexer's finite state machine */
  90     public Node token;
  91
  92     /*
  93       lexer character buffer
  94
  95       parse tree nodes span onto this buffer
  96       which contains the concatenated text
  97       contents of all of the elements.
  98
  99      lexsize must be reset for each file.
 100     */
 101     public byte[] lexbuf;   /* byte buffer of UTF-8 chars */
 102     public int lexlength;   /* allocated */
 103     public int lexsize;     /* used */
 104
 105     /* Inline stack for compatibility with Mosaic */
 106     public Node inode;        /* for deferring text node */
 107     public int insert;        /* for inferring inline tags */
 108     public Stack istack;
 109     public int istackbase;    /* start of frame */
 110
 111     public Style styles;      /* used for cleaning up presentation markup */
 112
 113     public Configuration configuration;
 114     protected int seenBodyEndTag; /* used by parser */
 115     private Vector nodeList;
 116
 117     public Lexer(IFile iFile, StreamIn in, Configuration configuration)
 118     {
 119         this.iFile = iFile;
 120         this.in = in;
 121         this.lines = 1;
 122         this.columns = 1;
 123         this.state = LEX_CONTENT;
 124         this.badAccess = 0;
 125         this.badLayout = 0;
 126         this.badChars = 0;
 127         this.badForm = 0;
 128         this.warnings = 0;
 129         this.errors = 0;
 130         this.waswhite = false;
 131         this.pushed = false;
 132         this.insertspace = false;
 133         this.exiled = false;
 134         this.isvoyager = false;
 135         this.versions = Dict.VERS_EVERYTHING;
 136         this.doctype = Dict.VERS_UNKNOWN;
 137         this.badDoctype = false;
 138         this.txtstart = 0;
 139         this.txtend = 0;
 140         this.token = null;
 141         this.lexbuf =  null;
 142         this.lexlength = 0;
 143         this.lexsize = 0;
 144         this.inode = null;
 145         this.insert = -1;
 146         this.istack = new Stack();
 147         this.istackbase = 0;
 148         this.styles = null;
 149         this.configuration = configuration;
 150         this.seenBodyEndTag = 0;
 151         this.nodeList = new Vector();
 152     }
 153
 154     public IFile getIFile() {
 155       return iFile;
 156     }
 157
 158     public Node newNode()
 159     {
 160         Node node = new Node();
 161         nodeList.addElement(node);
 162         return node;
 163     }
 164
 165     public Node newNode(short type, byte[] textarray, int start, int end)
 166     {
 167         Node node = new Node(type, textarray, start, end);
 168         nodeList.addElement(node);
 169         return node;
 170     }
 171
 172     public Node newNode(short type, byte[] textarray, int start, int end, String element)
 173     {
 174         Node node = new Node(type, textarray, start, end, element, configuration.tt);
 175         nodeList.addElement(node);
 176         return node;
 177     }
 178
 179     public Node cloneNode(Node node)
 180     {
 181         Node cnode = (Node)node.clone();
 182         nodeList.addElement(cnode);
 183         for (AttVal att = cnode.attributes; att != null; att = att.next) {
 184             if (att.asp != null)
 185                 nodeList.addElement(att.asp);
 186             if (att.php != null)
 187                 nodeList.addElement(att.php);
 188         }
 189         return cnode;
 190     }
 191
 192     public AttVal cloneAttributes(AttVal attrs)
 193     {
 194         AttVal cattrs = (AttVal)attrs.clone();
 195         for (AttVal att = cattrs; att != null; att = att.next) {
 196             if (att.asp != null)
 197                 nodeList.addElement(att.asp);
 198             if (att.php != null)
 199                 nodeList.addElement(att.php);
 200         }
 201         return cattrs;
 202     }
 203
 204     protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
 205     {
 206         Node node;
 207         for (int i = 0; i < nodeList.size(); i++) {
 208             node = (Node)(nodeList.elementAt(i));
 209             if (node.textarray == oldtextarray)
 210                 node.textarray = newtextarray;
 211         }
 212     }
 213
 214     /* used for creating preformatted text from Word2000 */
 215     public Node newLineNode()
 216     {
 217         Node node = newNode();
 218
 219         node.textarray = this.lexbuf;
 220         node.start = this.lexsize;
 221         addCharToLexer((int)'\n');
 222         node.end = this.lexsize;
 223         return node;
 224     }
 225
 226     // Should always be able convert to/from UTF-8, so encoding exceptions are
 227     // converted to an Error to avoid adding throws declarations in
 228     // lots of methods.
 229
 230     public static byte[] getBytes(String str) {
 231         try {
 232             return str.getBytes("UTF8");
 233         } catch (java.io.UnsupportedEncodingException e) {
 234             throw new Error("string to UTF-8 conversion failed: " + e.getMessage());
 235         }
 236     }
 237
 238     public static String getString(byte[] bytes, int offset, int length) {
 239         try {
 240             return new String(bytes, offset, length, "UTF8");
 241         } catch (java.io.UnsupportedEncodingException e) {
 242             throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
 243         }
 244     }
 245
 246     public boolean endOfInput()
 247     {
 248         return this.in.isEndOfStream();
 249     }
 250
 251     public void addByte(int c)
 252     {
 253         if (this.lexsize + 1 >= this.lexlength)
 254         {
 255             while (this.lexsize + 1 >= this.lexlength)
 256             {
 257                 if (this.lexlength == 0)
 258                     this.lexlength = 8192;
 259                 else
 260                     this.lexlength = this.lexlength * 2;
 261             }
 262
 263             byte[] temp = this.lexbuf;
 264             this.lexbuf = new byte[ this.lexlength ];
 265             if (temp != null)
 266             {
 267                 System.arraycopy( temp, 0, this.lexbuf, 0, temp.length );
 268                 updateNodeTextArrays(temp, this.lexbuf);
 269             }
 270         }
 271
 272         this.lexbuf[this.lexsize++] = (byte)c;
 273         this.lexbuf[this.lexsize] = (byte)'\0';  /* debug */
 274     }
 275
 276     public void changeChar(byte c)
 277     {
 278         if (this.lexsize > 0)
 279         {
 280             this.lexbuf[this.lexsize-1] = c;
 281         }
 282     }
 283
 284     /* store char c as UTF-8 encoded byte stream */
 285     public void addCharToLexer(int c)
 286     {
 287         if (c < 128)
 288             addByte(c);
 289         else if (c <= 0x7FF)
 290         {
 291             addByte(0xC0 | (c >> 6));
 292             addByte(0x80 | (c & 0x3F));
 293         }
 294         else if (c <= 0xFFFF)
 295         {
 296             addByte(0xE0 | (c >> 12));
 297             addByte(0x80 | ((c >> 6) & 0x3F));
 298             addByte(0x80 | (c & 0x3F));
 299         }
 300         else if (c <= 0x1FFFFF)
 301         {
 302             addByte(0xF0 | (c >> 18));
 303             addByte(0x80 | ((c >> 12) & 0x3F));
 304             addByte(0x80 | ((c >> 6) & 0x3F));
 305             addByte(0x80 | (c & 0x3F));
 306         }
 307         else
 308         {
 309             addByte(0xF8 | (c >> 24));
 310             addByte(0x80 | ((c >> 18) & 0x3F));
 311             addByte(0x80 | ((c >> 12) & 0x3F));
 312             addByte(0x80 | ((c >> 6) & 0x3F));
 313             addByte(0x80 | (c & 0x3F));
 314         }
 315     }
 316
 317     public void addStringToLexer(String str)
 318     {
 319         for ( int i = 0; i < str.length(); i++ ) {
 320             addCharToLexer( (int)str.charAt(i) );
 321         }
 322     }
 323
 324     /*
 325       No longer attempts to insert missing ';' for unknown
 326       enitities unless one was present already, since this
 327       gives unexpected results.
 328
 329       For example:   <a href="something.htm?foo&bar&fred">
 330       was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
 331       rather than:   <a href="something.htm?foo&amp;bar&amp;fred">
 332
 333       My thanks for Maurice Buxton for spotting this.
 334     */
 335     public void parseEntity(short mode)
 336     {
 337         short map;
 338         int start;
 339         boolean first = true;
 340         boolean semicolon = false;
 341         boolean numeric = false;
 342         int c, ch, startcol;
 343         String str;
 344
 345         start = this.lexsize - 1;  /* to start at "&" */
 346         startcol = this.in.curcol - 1;
 347
 348         while (true)
 349         {
 350             c = this.in.readChar();
 351             if (c == StreamIn.EndOfStream) break;
 352             if (c == ';')
 353             {
 354                 semicolon = true;
 355                 break;
 356             }
 357
 358             if (first && c == '#')
 359             {
 360                 addCharToLexer(c);
 361                 first = false;
 362                 numeric = true;
 363                 continue;
 364             }
 365
 366             first = false;
 367             map = MAP((char)c);
 368
 369             /* AQ: Added flag for numeric entities so that numeric entities
 370                with missing semi-colons are recognized.
 371                Eg. "&#114e&#112;..." is recognized as "rep"
 372             */
 373             if (numeric && ((c == 'x') || ((map & DIGIT) != 0)))
 374             {
 375                 addCharToLexer(c);
 376                 continue;
 377             }
 378             if (!numeric && ((map & NAMECHAR) != 0))
 379             {
 380                 addCharToLexer(c);
 381                 continue;
 382             }
 383
 384             /* otherwise put it back */
 385
 386             this.in.ungetChar(c);
 387             break;
 388         }
 389
 390         str = getString( this.lexbuf, start, this.lexsize - start );
 391         ch = EntityTable.getDefaultEntityTable().entityCode( str );
 392
 393         /* deal with unrecognized entities */
 394         if (ch <= 0)
 395         {
 396             /* set error position just before offending chararcter */
 397             this.lines = this.in.curline;
 398             this.columns = startcol;
 399
 400             if (this.lexsize > start +1 )
 401             {
 402                 Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
 403
 404                 if (semicolon)
 405                     addCharToLexer(';');
 406             }
 407             else /* naked & */
 408             {
 409                 Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
 410             }
 411         }
 412         else
 413         {
 414             if (c != ';')    /* issue warning if not terminated by ';' */
 415             {
 416                 /* set error position just before offending chararcter */
 417                 this.lines = this.in.curline;
 418                 this.columns = startcol;
 419                 Report.entityError(this, Report.MISSING_SEMICOLON, str, c);
 420             }
 421
 422             this.lexsize = start;
 423
 424             if (ch == 160 && (mode & Preformatted) != 0)
 425                 ch = ' ';
 426
 427             addCharToLexer(ch);
 428
 429             if (ch == '&' && !this.configuration.QuoteAmpersand)
 430             {
 431                 addCharToLexer('a');
 432                 addCharToLexer('m');
 433                 addCharToLexer('p');
 434                 addCharToLexer(';');
 435             }
 436         }
 437     }
 438
 439     public char parseTagName()
 440     {
 441         short map;
 442         int c;
 443
 444         /* fold case of first char in buffer */
 445
 446         c = this.lexbuf[this.txtstart];
 447         map = MAP((char)c);
 448
 449         if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
 450         {
 451             c += (int)((int)'a' - (int)'A');
 452             this.lexbuf[this.txtstart] = (byte)c;
 453         }
 454
 455         while (true)
 456         {
 457             c = this.in.readChar();
 458             if (c == StreamIn.EndOfStream) break;
 459             map = MAP((char)c);
 460
 461             if ((map & NAMECHAR) == 0)
 462                 break;
 463
 464             /* fold case of subsequent chars */
 465
 466             if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
 467                 c += (int)((int)'a' - (int)'A');
 468
 469             addCharToLexer(c);
 470         }
 471
 472         this.txtend = this.lexsize;
 473         return (char)c;
 474     }
 475
 476     public void addStringLiteral(String str)
 477     {
 478         for ( int i = 0; i < str.length(); i++ ) {
 479             addCharToLexer( (int)str.charAt(i) );
 480         }
 481     }
 482
 483     /* choose what version to use for new doctype */
 484     public short HTMLVersion()
 485     {
 486         short versions;
 487
 488         versions = this.versions;
 489
 490         if ((versions & Dict.VERS_HTML20) != 0)
 491             return Dict.VERS_HTML20;
 492
 493         if ((versions & Dict.VERS_HTML32) != 0)
 494             return Dict.VERS_HTML32;
 495
 496         if ((versions & Dict.VERS_HTML40_STRICT) != 0)
 497             return Dict.VERS_HTML40_STRICT;
 498
 499         if ((versions & Dict.VERS_HTML40_LOOSE) != 0)
 500             return Dict.VERS_HTML40_LOOSE;
 501
 502         if ((versions & Dict.VERS_FRAMES) != 0)
 503             return Dict.VERS_FRAMES;
 504
 505         return Dict.VERS_UNKNOWN;
 506     }
 507
 508     public String HTMLVersionName()
 509     {
 510         short guessed;
 511         int j;
 512
 513         guessed = apparentVersion();
 514
 515         for (j = 0; j < W3CVersion.length; ++j)
 516         {
 517             if (guessed == W3CVersion[j].code)
 518             {
 519                 if (this.isvoyager)
 520                     return W3CVersion[j].voyagerName;
 521
 522                 return W3CVersion[j].name;
 523             }
 524         }
 525
 526         return null;
 527     }
 528
 529     /* add meta element for Tidy */
 530     public boolean addGenerator(Node root)
 531     {
 532         AttVal attval;
 533         Node node;
 534         Node head = root.findHEAD(configuration.tt);
 535
 536         if (head != null)
 537         {
 538             for (node = head.content; node != null; node = node.next)
 539             {
 540                 if (node.tag == configuration.tt.tagMeta)
 541                 {
 542                     attval = node.getAttrByName("name");
 543
 544                     if (attval != null && attval.value != null &&
 545                         Lexer.wstrcasecmp(attval.value, "generator") == 0)
 546                     {
 547                         attval = node.getAttrByName("content");
 548
 549                         if (attval != null && attval.value != null &&
 550                             attval.value.length() >= 9 &&
 551                             Lexer.wstrcasecmp(attval.value.substring(0, 9), "HTML Tidy") == 0)
 552                         {
 553                             return false;
 554                         }
 555                     }
 556                 }
 557             }
 558
 559             node = this.inferredTag("meta");
 560             node.addAttribute("content", "HTML Tidy, see www.w3.org");
 561             node.addAttribute("name", "generator");
 562             Node.insertNodeAtStart(head, node);
 563             return true;
 564         }
 565
 566         return false;
 567     }
 568
 569     /* return true if substring s is in p and isn't all in upper case */
 570     /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
 571     /* len is how many chars to check in p */
 572     private static boolean findBadSubString(String s, String p, int len)
 573     {
 574         int n = s.length();
 575         int i = 0;
 576         String ps;
 577
 578         while (n < len)
 579         {
 580             ps = p.substring(i, i + n);
 581             if (wstrcasecmp(s, ps) == 0)
 582                 return (!ps.equals(s.substring(0, n)));
 583
 584             ++i;
 585             --len;
 586         }
 587
 588         return false;
 589     }
 590
 591     public boolean checkDocTypeKeyWords(Node doctype)
 592     {
 593         int len = doctype.end - doctype.start;
 594         String s = getString(this.lexbuf, doctype.start, len);
 595
 596         return !(
 597             findBadSubString("SYSTEM", s, len) ||
 598             findBadSubString("PUBLIC", s, len) ||
 599             findBadSubString("//DTD", s, len) ||
 600             findBadSubString("//W3C", s, len) ||
 601             findBadSubString("//EN", s, len)
 602             );
 603     }
 604
 605     /* examine <!DOCTYPE> to identify version */
 606     public short findGivenVersion(Node doctype)
 607     {
 608         String p, s;
 609         int i, j;
 610         int len;
 611         String str1;
 612         String str2;
 613
 614         /* if root tag for doctype isn't html give up now */
 615         str1 = getString(this.lexbuf, doctype.start, 5);
 616         if (wstrcasecmp(str1, "html ") != 0)
 617             return 0;
 618
 619         if (!checkDocTypeKeyWords(doctype))
 620             Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
 621
 622         /* give up if all we are given is the system id for the doctype */
 623         str1 = getString(this.lexbuf, doctype.start + 5, 7);
 624         if (wstrcasecmp(str1, "SYSTEM ") == 0)
 625         {
 626             /* but at least ensure the case is correct */
 627             if (!str1.substring(0, 6).equals("SYSTEM"))
 628                 System.arraycopy( getBytes("SYSTEM"), 0,
 629                                   this.lexbuf, doctype.start + 5, 6 );
 630             return 0;  /* unrecognized */
 631         }
 632
 633         if (wstrcasecmp(str1, "PUBLIC ") == 0)
 634         {
 635             if (!str1.substring(0, 6).equals("PUBLIC"))
 636                 System.arraycopy( getBytes("PUBLIC "), 0,
 637                                   this.lexbuf, doctype.start + 5, 6 );
 638         }
 639         else
 640             this.badDoctype = true;
 641
 642         for (i = doctype.start; i < doctype.end; ++i)
 643         {
 644             if (this.lexbuf[i] == (byte)'"')
 645             {
 646                 str1 = getString( this.lexbuf, i + 1, 12 );
 647                 str2 = getString( this.lexbuf, i + 1, 13 );
 648                 if (str1.equals("-//W3C//DTD "))
 649                 {
 650                     /* compute length of identifier e.g. "HTML 4.0 Transitional" */
 651                     for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
 652                     len = j - i - 13;
 653                     p = getString( this.lexbuf, i + 13, len );
 654
 655                     for (j = 1; j < W3CVersion.length; ++j)
 656                     {
 657                         s = W3CVersion[j].name;
 658                         if (len == s.length() && s.equals(p))
 659                             return W3CVersion[j].code;
 660                     }
 661
 662                     /* else unrecognized version */
 663                 }
 664                 else if (str2.equals("-//IETF//DTD "))
 665                 {
 666                     /* compute length of identifier e.g. "HTML 2.0" */
 667                     for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
 668                     len = j - i - 14;
 669
 670                     p = getString( this.lexbuf, i + 14, len );
 671                     s = W3CVersion[0].name;
 672                     if (len == s.length() && s.equals(p))
 673                         return W3CVersion[0].code;
 674
 675                     /* else unrecognized version */
 676                 }
 677                 break;
 678             }
 679         }
 680
 681         return 0;
 682     }
 683
 684     public void fixHTMLNameSpace(Node root, String profile)
 685     {
 686         Node node;
 687         AttVal prev, attr;
 688
 689         for (node = root.content;
 690                 node != null && node.tag != configuration.tt.tagHtml; node = node.next);
 691
 692         if (node != null)
 693         {
 694             prev = null;
 695
 696             for (attr = node.attributes; attr != null; attr = attr.next)
 697             {
 698                 if (attr.attribute.equals("xmlns"))
 699                     break;
 700
 701                 prev = attr;
 702             }
 703
 704             if (attr != null)
 705             {
 706                 if (!attr.value.equals(profile))
 707                 {
 708                     Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
 709                     attr.value = profile;
 710                 }
 711             }
 712             else
 713             {
 714                 attr = new AttVal( node.attributes, null, (int)'"',
 715                                    "xmlns", profile );
 716                 attr.dict =
 717                     AttributeTable.getDefaultAttributeTable().findAttribute( attr );
 718                 node.attributes = attr;
 719             }
 720         }
 721     }
 722
 723     public boolean setXHTMLDocType(Node root)
 724     {
 725         String fpi = " ";
 726         String sysid = "";
 727         String namespace = XHTML_NAMESPACE;
 728         Node doctype;
 729
 730         doctype = root.findDocType();
 731
 732         if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
 733         {
 734             if (doctype != null)
 735                 Node.discardElement(doctype);
 736             return true;
 737         }
 738
 739         if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
 740         {
 741             /* see what flavor of XHTML this document matches */
 742             if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
 743             {  /* use XHTML strict */
 744                 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
 745                 sysid = voyager_strict;
 746             }
 747             else if ((this.versions & Dict.VERS_LOOSE) != 0)
 748             {
 749                 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
 750                 sysid = voyager_loose;
 751             }
 752             else if ((this.versions & Dict.VERS_FRAMES) != 0)
 753             {   /* use XHTML frames */
 754                 fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
 755                 sysid = voyager_frameset;
 756             }
 757             else /* lets assume XHTML transitional */
 758             {
 759                 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
 760                 sysid = voyager_loose;
 761             }
 762         }
 763         else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
 764         {
 765             fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
 766             sysid = voyager_strict;
 767         }
 768         else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
 769         {
 770             fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
 771             sysid = voyager_loose;
 772         }
 773
 774         fixHTMLNameSpace(root, namespace);
 775
 776         if (doctype == null)
 777         {
 778             doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
 779             doctype.next = root.content;
 780             doctype.parent = root;
 781             doctype.prev = null;
 782             root.content = doctype;
 783         }
 784
 785         if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
 786             configuration.docTypeStr != null)
 787         {
 788             fpi = configuration.docTypeStr;
 789             sysid = "";
 790         }
 791
 792         this.txtstart = this.lexsize;
 793         this.txtend = this.lexsize;
 794
 795         /* add public identifier */
 796         addStringLiteral("html PUBLIC ");
 797
 798         /* check if the fpi is quoted or not */
 799         if (fpi.charAt(0) == '"')
 800             addStringLiteral(fpi);
 801         else
 802         {
 803             addStringLiteral("\"");
 804             addStringLiteral(fpi);
 805             addStringLiteral("\"");
 806         }
 807
 808         if (sysid.length() + 6 >= this.configuration.wraplen)
 809             addStringLiteral("\n\"");
 810         else
 811             addStringLiteral("\n    \"");
 812
 813         /* add system identifier */
 814         addStringLiteral(sysid);
 815         addStringLiteral("\"");
 816
 817         this.txtend = this.lexsize;
 818
 819         doctype.start = this.txtstart;
 820         doctype.end = this.txtend;
 821
 822         return false;
 823     }
 824
 825     public short apparentVersion()
 826     {
 827         switch (this.doctype)
 828         {
 829         case Dict.VERS_UNKNOWN:
 830             return HTMLVersion();
 831
 832         case Dict.VERS_HTML20:
 833             if ((this.versions & Dict.VERS_HTML20) != 0)
 834                 return Dict.VERS_HTML20;
 835
 836             break;
 837
 838         case Dict.VERS_HTML32:
 839             if ((this.versions & Dict.VERS_HTML32) != 0)
 840                 return Dict.VERS_HTML32;
 841
 842             break; /* to replace old version by new */
 843
 844         case Dict.VERS_HTML40_STRICT:
 845             if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
 846                 return Dict.VERS_HTML40_STRICT;
 847
 848             break;
 849
 850         case Dict.VERS_HTML40_LOOSE:
 851             if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
 852                 return Dict.VERS_HTML40_LOOSE;
 853
 854             break; /* to replace old version by new */
 855
 856         case Dict.VERS_FRAMES:
 857             if ((this.versions & Dict.VERS_FRAMES) != 0)
 858                 return Dict.VERS_FRAMES;
 859
 860             break;
 861         }
 862
 863         Report.warning(this, null, null, Report.INCONSISTENT_VERSION);
 864         return this.HTMLVersion();
 865     }
 866
 867     /* fixup doctype if missing */
 868     public boolean fixDocType(Node root)
 869     {
 870         Node doctype;
 871         int guessed = Dict.VERS_HTML40_STRICT, i;
 872
 873         if (this.badDoctype)
 874             Report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
 875
 876         if (configuration.XmlOut)
 877             return true;
 878
 879         doctype = root.findDocType();
 880
 881         if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
 882         {
 883             if (doctype != null)
 884                 Node.discardElement(doctype);
 885             return true;
 886         }
 887
 888         if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
 889         {
 890             Node.discardElement(doctype);
 891             doctype = null;
 892             guessed = Dict.VERS_HTML40_STRICT;
 893         }
 894         else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
 895         {
 896             Node.discardElement(doctype);
 897             doctype = null;
 898             guessed = Dict.VERS_HTML40_LOOSE;
 899         }
 900         else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
 901         {
 902             if (doctype != null)
 903             {
 904                 if (this.doctype == Dict.VERS_UNKNOWN)
 905                     return false;
 906
 907                 switch (this.doctype)
 908                 {
 909                 case Dict.VERS_UNKNOWN:
 910                     return false;
 911
 912                 case Dict.VERS_HTML20:
 913                     if ((this.versions & Dict.VERS_HTML20) != 0)
 914                         return true;
 915
 916                     break; /* to replace old version by new */
 917
 918                 case Dict.VERS_HTML32:
 919                     if ((this.versions & Dict.VERS_HTML32) != 0)
 920                         return true;
 921
 922                     break; /* to replace old version by new */
 923
 924                 case Dict.VERS_HTML40_STRICT:
 925                     if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
 926                         return true;
 927
 928                     break; /* to replace old version by new */
 929
 930                 case Dict.VERS_HTML40_LOOSE:
 931                     if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
 932                         return true;
 933
 934                     break; /* to replace old version by new */
 935
 936                 case Dict.VERS_FRAMES:
 937                     if ((this.versions & Dict.VERS_FRAMES) != 0)
 938                         return true;
 939
 940                     break; /* to replace old version by new */
 941                 }
 942
 943                 /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
 944             }
 945
 946             /* choose new doctype */
 947             guessed = HTMLVersion();
 948         }
 949
 950         if (guessed == Dict.VERS_UNKNOWN)
 951             return false;
 952
 953         /* for XML use the Voyager system identifier */
 954         if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager)
 955         {
 956             if (doctype != null)
 957                 Node.discardElement(doctype);
 958
 959             for (i = 0; i < W3CVersion.length; ++i)
 960             {
 961                 if (guessed == W3CVersion[i].code)
 962                 {
 963                     fixHTMLNameSpace(root, W3CVersion[i].profile);
 964                     break;
 965                 }
 966             }
 967
 968             return true;
 969         }
 970
 971         if (doctype == null)
 972         {
 973             doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
 974             doctype.next = root.content;
 975             doctype.parent = root;
 976             doctype.prev = null;
 977             root.content = doctype;
 978         }
 979
 980         this.txtstart = this.lexsize;
 981         this.txtend = this.lexsize;
 982
 983         /* use the appropriate public identifier */
 984         addStringLiteral("html PUBLIC ");
 985
 986         if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
 987             configuration.docTypeStr != null)
 988             addStringLiteral(configuration.docTypeStr);
 989         else if (guessed == Dict.VERS_HTML20)
 990             addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
 991         else
 992         {
 993             addStringLiteral("\"-//W3C//DTD ");
 994
 995             for (i = 0; i < W3CVersion.length; ++i)
 996             {
 997                 if (guessed == W3CVersion[i].code)
 998                 {
 999                     addStringLiteral(W3CVersion[i].name);
1000                     break;
1001                 }
1002             }
1003
1004             addStringLiteral("//EN\"");
1005         }
1006
1007         this.txtend = this.lexsize;
1008
1009         doctype.start = this.txtstart;
1010         doctype.end = this.txtend;
1011
1012         return true;
1013     }
1014
1015     /* ensure XML document starts with <?XML version="1.0"?> */
1016     public boolean fixXMLPI(Node root)
1017     {
1018         Node xml;
1019         int s;
1020
1021         if( root.content != null && root.content.type == Node.ProcInsTag)
1022         {
1023             s = root.content.start;
1024
1025             if (this.lexbuf[s] == (byte)'x' &&
1026                 this.lexbuf[s+1] == (byte)'m' &&
1027                 this.lexbuf[s+2] == (byte)'l')
1028                 return true;
1029         }
1030
1031         xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0);
1032         xml.next = root.content;
1033
1034         if (root.content != null)
1035         {
1036             root.content.prev = xml;
1037             xml.next = root.content;
1038         }
1039
1040         root.content = xml;
1041
1042         this.txtstart = this.lexsize;
1043         this.txtend = this.lexsize;
1044         addStringLiteral("xml version=\"1.0\"");
1045         if (this.configuration.CharEncoding == Configuration.LATIN1)
1046             addStringLiteral(" encoding=\"ISO-8859-1\"");
1047         this.txtend = this.lexsize;
1048
1049         xml.start = this.txtstart;
1050         xml.end = this.txtend;
1051         return false;
1052     }
1053
1054     public Node inferredTag(String name)
1055     {
1056         Node node;
1057
1058         node = newNode(Node.StartTag,
1059                         this.lexbuf,
1060                         this.txtstart,
1061                         this.txtend,
1062                         name);
1063         node.implicit = true;
1064         return node;
1065     }
1066
1067     public static boolean expectsContent(Node node)
1068     {
1069         if (node.type != Node.StartTag)
1070             return false;
1071
1072         /* unknown element? */
1073         if (node.tag == null)
1074             return true;
1075
1076         if ((node.tag.model & Dict.CM_EMPTY) != 0)
1077             return false;
1078
1079         return true;
1080     }
1081
1082     /*
1083       create a text node for the contents of
1084       a CDATA element like style or script
1085       which ends with </foo> for some foo.
1086     */
1087     public Node getCDATA(Node container)
1088     {
1089         int c, lastc, start, len, i;
1090         String str;
1091         boolean endtag = false;
1092
1093         this.lines = this.in.curline;
1094         this.columns = this.in.curcol;
1095         this.waswhite = false;
1096         this.txtstart = this.lexsize;
1097         this.txtend = this.lexsize;
1098
1099         lastc = (int)'\0';
1100         start = -1;
1101
1102         while (true)
1103         {
1104             c = this.in.readChar();
1105             if (c == StreamIn.EndOfStream) break;
1106             /* treat \r\n as \n and \r as \n */
1107
1108             if (c == (int)'/' && lastc == (int)'<')
1109             {
1110                 if (endtag)
1111                 {
1112                     this.lines = this.in.curline;
1113                     this.columns = this.in.curcol - 3;
1114
1115                     Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1116                 }
1117
1118                 start = this.lexsize + 1;  /* to first letter */
1119                 endtag = true;
1120             }
1121             else if (c == (int)'>' && start >= 0)
1122             {
1123                 len = this.lexsize - start;
1124                 if (len == container.element.length())
1125                 {
1126                     str = getString( this.lexbuf, start, len );
1127                     if (Lexer.wstrcasecmp(str, container.element) == 0)
1128                     {
1129                         this.txtend = start - 2;
1130                         break;
1131                     }
1132                 }
1133
1134                 this.lines = this.in.curline;
1135                 this.columns = this.in.curcol - 3;
1136
1137                 Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1138
1139                 /* if javascript insert backslash before / */
1140
1141                 if (ParserImpl.isJavaScript(container))
1142                 {
1143                     for (i = this.lexsize; i > start-1; --i)
1144                         this.lexbuf[i] = this.lexbuf[i-1];
1145
1146                     this.lexbuf[start-1] = (byte)'\\';
1147                     this.lexsize++;
1148                 }
1149
1150                 start = -1;
1151             }
1152             else if (c == (int)'\r')
1153             {
1154                 c = this.in.readChar();
1155
1156                 if (c != (int)'\n')
1157                     this.in.ungetChar(c);
1158
1159                 c = (int)'\n';
1160             }
1161
1162             addCharToLexer((int)c);
1163             this.txtend = this.lexsize;
1164             lastc = c;
1165         }
1166
1167         if (c == StreamIn.EndOfStream)
1168             Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
1169
1170         if (this.txtend > this.txtstart)
1171         {
1172             this.token = newNode(Node.TextNode,
1173                                   this.lexbuf,
1174                                   this.txtstart,
1175                                   this.txtend);
1176             return this.token;
1177         }
1178
1179         return null;
1180     }
1181
1182     public void ungetToken()
1183     {
1184         this.pushed = true;
1185     }
1186
1187     public static final short IgnoreWhitespace    = 0;
1188     public static final short MixedContent        = 1;
1189     public static final short Preformatted        = 2;
1190     public static final short IgnoreMarkup        = 3;
1191
    /*
      modes for getToken()

      IgnoreWhitespace   -- for elements which don't accept PCDATA;
                            leading white space is discarded
      MixedContent       -- white space sequences compacted to a single space
      Preformatted       -- white space preserved as is
      IgnoreMarkup       -- for CDATA elements such as script, style
    */
1199
1200     public Node getToken(short mode)
1201     {
1202         short map;
1203         int c = 0;
1204         int lastc;
1205         int badcomment = 0;
1206         MutableBoolean isempty = new MutableBoolean();
1207         AttVal attributes;
1208
1209         if (this.pushed)
1210         {
1211             /* duplicate inlines in preference to pushed text nodes when appropriate */
1212             if (this.token.type != Node.TextNode ||
1213                 (this.insert == -1 && this.inode == null))
1214             {
1215                 this.pushed = false;
1216                 return this.token;
1217             }
1218         }
1219
1220         /* at start of block elements, unclosed inline
1221            elements are inserted into the token stream */
1222
1223         if (this.insert != -1 || this.inode != null)
1224             return insertedToken();
1225
1226         this.lines = this.in.curline;
1227         this.columns = this.in.curcol;
1228         this.waswhite = false;
1229
1230         this.txtstart = this.lexsize;
1231         this.txtend = this.lexsize;
1232
1233         while (true)
1234         {
1235             c = this.in.readChar();
1236             if (c == StreamIn.EndOfStream) break;
1237             if (this.insertspace && mode != IgnoreWhitespace)
1238             {
1239                 addCharToLexer(' ');
1240                 this.waswhite = true;
1241                 this.insertspace = false;
1242             }
1243
1244             /* treat \r\n as \n and \r as \n */
1245
1246             if (c == '\r')
1247             {
1248                 c = this.in.readChar();
1249
1250                 if (c != '\n')
1251                     this.in.ungetChar(c);
1252
1253                 c = '\n';
1254             }
1255
1256             addCharToLexer(c);
1257
1258             switch (this.state)
1259             {
1260             case LEX_CONTENT:  /* element content */
1261                 map = MAP((char)c);
1262
1263                 /*
1264                  Discard white space if appropriate. Its cheaper
1265                  to do this here rather than in parser methods
1266                  for elements that don't have mixed content.
1267                 */
1268                 if (((map & WHITE) != 0) && (mode == IgnoreWhitespace)
1269                       && this.lexsize == this.txtstart + 1)
1270                 {
1271                     --this.lexsize;
1272                     this.waswhite = false;
1273                     this.lines = this.in.curline;
1274                     this.columns = this.in.curcol;
1275                     continue;
1276                 }
1277
1278                 if (c == '<')
1279                 {
1280                     this.state = LEX_GT;
1281                     continue;
1282                 }
1283
1284                 if ((map & WHITE) != 0)
1285                 {
1286                     /* was previous char white? */
1287                     if (this.waswhite)
1288                     {
1289                         if (mode != Preformatted && mode != IgnoreMarkup)
1290                         {
1291                             --this.lexsize;
1292                             this.lines = this.in.curline;
1293                             this.columns = this.in.curcol;
1294                         }
1295                     }
1296                     else /* prev char wasn't white */
1297                     {
1298                         this.waswhite = true;
1299                         lastc = c;
1300
1301                         if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
1302                             changeChar((byte)' ');
1303                     }
1304
1305                     continue;
1306                 }
1307                 else if (c == '&' && mode != IgnoreMarkup)
1308                     parseEntity(mode);
1309
1310                 /* this is needed to avoid trimming trailing whitespace */
1311                 if (mode == IgnoreWhitespace)
1312                     mode = MixedContent;
1313
1314                 this.waswhite = false;
1315                 continue;
1316
1317             case LEX_GT:  /* < */
1318
1319                 /* check for endtag */
1320                 if (c == '/')
1321                 {
1322                     c = this.in.readChar();
1323                     if (c == StreamIn.EndOfStream)
1324                     {
1325                         this.in.ungetChar(c);
1326                         continue;
1327                     }
1328
1329                     addCharToLexer(c);
1330                     map = MAP((char)c);
1331
1332                     if ((map & LETTER) != 0)
1333                     {
1334                         this.lexsize -= 3;
1335                         this.txtend = this.lexsize;
1336                         this.in.ungetChar(c);
1337                         this.state = LEX_ENDTAG;
1338                         this.lexbuf[this.lexsize] = (byte)'\0';  /* debug */
1339                         this.in.curcol -= 2;
1340
1341                         /* if some text before the </ return it now */
1342                         if (this.txtend > this.txtstart)
1343                         {
1344                             /* trim space char before end tag */
1345                             if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte)' ')
1346                             {
1347                                 this.lexsize -= 1;
1348                                 this.txtend = this.lexsize;
1349                             }
1350
1351                             this.token = newNode(Node.TextNode,
1352                                                   this.lexbuf,
1353                                                   this.txtstart,
1354                                                   this.txtend);
1355                             return this.token;
1356                         }
1357
1358                         continue;       /* no text so keep going */
1359                     }
1360
1361                     /* otherwise treat as CDATA */
1362                     this.waswhite = false;
1363                     this.state = LEX_CONTENT;
1364                     continue;
1365                 }
1366
1367                 if (mode == IgnoreMarkup)
1368                 {
1369                     /* otherwise treat as CDATA */
1370                     this.waswhite = false;
1371                     this.state = LEX_CONTENT;
1372                     continue;
1373                 }
1374
1375                 /*
1376                    look out for comments, doctype or marked sections
1377                    this isn't quite right, but its getting there ...
1378                 */
1379                 if (c == '!')
1380                 {
1381                     c = this.in.readChar();
1382
1383                     if (c == '-')
1384                     {
1385                         c = this.in.readChar();
1386
1387                         if (c == '-')
1388                         {
1389                             this.state = LEX_COMMENT;  /* comment */
1390                             this.lexsize -= 2;
1391                             this.txtend = this.lexsize;
1392
1393                             /* if some text before < return it now */
1394                             if (this.txtend > this.txtstart)
1395                             {
1396                                 this.token = newNode(Node.TextNode,
1397                                                       this.lexbuf,
1398                                                       this.txtstart,
1399                                                       this.txtend);
1400                                 return this.token;
1401                             }
1402
1403                             this.txtstart = this.lexsize;
1404                             continue;
1405                         }
1406
1407                         Report.warning(this, null, null, Report.MALFORMED_COMMENT);
1408                     }
1409                     else if (c == 'd' || c == 'D')
1410                     {
1411                         this.state = LEX_DOCTYPE; /* doctype */
1412                         this.lexsize -= 2;
1413                         this.txtend = this.lexsize;
1414                         mode = IgnoreWhitespace;
1415
1416                         /* skip until white space or '>' */
1417
1418                         for (;;)
1419                         {
1420                             c = this.in.readChar();
1421
1422                             if (c == StreamIn.EndOfStream || c == '>')
1423                             {
1424                                 this.in.ungetChar(c);
1425                                 break;
1426                             }
1427
1428                             map = MAP((char)c);
1429
1430                             if ((map & WHITE) == 0)
1431                                 continue;
1432
1433                             /* and skip to end of whitespace */
1434
1435                             for (;;)
1436                             {
1437                                 c = this.in.readChar();
1438
1439                                 if (c == StreamIn.EndOfStream || c == '>')
1440                                 {
1441                                     this.in.ungetChar(c);
1442                                     break;
1443                                 }
1444
1445                                 map = MAP((char)c);
1446
1447                                 if ((map & WHITE) != 0)
1448                                     continue;
1449
1450                                 this.in.ungetChar(c);
1451                                     break;
1452                             }
1453
1454                             break;
1455                         }
1456
1457                         /* if some text before < return it now */
1458                         if (this.txtend > this.txtstart)
1459                         {
1460                                 this.token = newNode(Node.TextNode,
1461                                                       this.lexbuf,
1462                                                       this.txtstart,
1463                                                       this.txtend);
1464                                 return this.token;
1465                         }
1466
1467                         this.txtstart = this.lexsize;
1468                         continue;
1469                     }
1470                     else if (c == '[')
1471                     {
1472                         /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
1473                         this.lexsize -= 2;
1474                         this.state = LEX_SECTION;
1475                         this.txtend = this.lexsize;
1476
1477                         /* if some text before < return it now */
1478                         if (this.txtend > this.txtstart)
1479                         {
1480                                 this.token = newNode(Node.TextNode,
1481                                                       this.lexbuf,
1482                                                       this.txtstart,
1483                                                       this.txtend);
1484                                 return this.token;
1485                         }
1486
1487                         this.txtstart = this.lexsize;
1488                         continue;
1489                     }
1490
1491                     /* otherwise swallow chars up to and including next '>' */
1492                     while (true)
1493                     {
1494                         c = this.in.readChar();
1495                         if (c == '>') break;
1496                         if (c == -1)
1497                         {
1498                             this.in.ungetChar(c);
1499                             break;
1500                         }
1501                     }
1502
1503                     this.lexsize -= 2;
1504                     this.lexbuf[this.lexsize] = (byte)'\0';
1505                     this.state = LEX_CONTENT;
1506                     continue;
1507                 }
1508
1509                 /*
1510                    processing instructions
1511                 */
1512
1513                 if (c == '?')
1514                 {
1515                     this.lexsize -= 2;
1516                     this.state = LEX_PROCINSTR;
1517                     this.txtend = this.lexsize;
1518
1519                     /* if some text before < return it now */
1520                     if (this.txtend > this.txtstart)
1521                     {
1522                         this.token = newNode(Node.TextNode,
1523                                               this.lexbuf,
1524                                               this.txtstart,
1525                                               this.txtend);
1526                         return this.token;
1527                     }
1528
1529                     this.txtstart = this.lexsize;
1530                     continue;
1531                 }
1532
1533                 /* Microsoft ASP's e.g. <% ... server-code ... %> */
1534                 if (c == '%')
1535                 {
1536                     this.lexsize -= 2;
1537                     this.state = LEX_ASP;
1538                     this.txtend = this.lexsize;
1539
1540                     /* if some text before < return it now */
1541                     if (this.txtend > this.txtstart)
1542                     {
1543                         this.token = newNode(Node.TextNode,
1544                                               this.lexbuf,
1545                                               this.txtstart,
1546                                               this.txtend);
1547                         return this.token;
1548                     }
1549
1550                     this.txtstart = this.lexsize;
1551                     continue;
1552                 }
1553
1554                 /* Netscapes JSTE e.g. <# ... server-code ... #> */
1555                 if (c == '#')
1556                 {
1557                     this.lexsize -= 2;
1558                     this.state = LEX_JSTE;
1559                     this.txtend = this.lexsize;
1560
1561                     /* if some text before < return it now */
1562                     if (this.txtend > this.txtstart)
1563                     {
1564                         this.token = newNode(Node.TextNode,
1565                                               this.lexbuf,
1566                                               this.txtstart,
1567                                               this.txtend);
1568                         return this.token;
1569                     }
1570
1571                     this.txtstart = this.lexsize;
1572                     continue;
1573                 }
1574
1575                 map = MAP((char)c);
1576
1577                 /* check for start tag */
1578                 if ((map & LETTER) != 0)
1579                 {
1580                     this.in.ungetChar(c);     /* push back letter */
1581                     this.lexsize -= 2;      /* discard "<" + letter */
1582                     this.txtend = this.lexsize;
1583                     this.state = LEX_STARTTAG;         /* ready to read tag name */
1584
1585                     /* if some text before < return it now */
1586                     if (this.txtend > this.txtstart)
1587                     {
1588                         this.token = newNode(Node.TextNode,
1589                                               this.lexbuf,
1590                                               this.txtstart,
1591                                               this.txtend);
1592                         return this.token;
1593                     }
1594
1595                     continue;       /* no text so keep going */
1596                 }
1597
1598                 /* otherwise treat as CDATA */
1599                 this.state = LEX_CONTENT;
1600                 this.waswhite = false;
1601                 continue;
1602
1603             case LEX_ENDTAG:  /* </letter */
1604                 this.txtstart = this.lexsize - 1;
1605                 this.in.curcol += 2;
1606                 c = parseTagName();
1607                 this.token = newNode(Node.EndTag, /* create endtag token */
1608                                       this.lexbuf,
1609                                       this.txtstart,
1610                                       this.txtend,
1611                                       getString(this.lexbuf,
1612                                                  this.txtstart,
1613                                                  this.txtend - this.txtstart));
1614                 this.lexsize = this.txtstart;
1615                 this.txtend = this.txtstart;
1616
1617                 /* skip to '>' */
1618                 while (c != '>')
1619                 {
1620                     c = this.in.readChar();
1621
1622                     if (c == StreamIn.EndOfStream)
1623                         break;
1624                 }
1625
1626                 if (c == StreamIn.EndOfStream)
1627                 {
1628                     this.in.ungetChar(c);
1629                     continue;
1630                 }
1631
1632                 this.state = LEX_CONTENT;
1633                 this.waswhite = false;
1634                 return this.token;  /* the endtag token */
1635
1636             case LEX_STARTTAG: /* first letter of tagname */
1637                 this.txtstart = this.lexsize - 1; /* set txtstart to first letter */
1638                 c = parseTagName();
1639                 isempty.value = false;
1640                 attributes = null;
1641                 this.token = newNode((isempty.value ? Node.StartEndTag : Node.StartTag),
1642                                       this.lexbuf,
1643                                       this.txtstart,
1644                                       this.txtend,
1645                                       getString(this.lexbuf,
1646                                                  this.txtstart,
1647                                                  this.txtend - this.txtstart));
1648
1649                 /* parse attributes, consuming closing ">" */
1650                 if (c != '>')
1651                 {
1652                     if (c == '/')
1653                         this.in.ungetChar(c);
1654
1655                     attributes = parseAttrs(isempty);
1656                 }
1657
1658                 if (isempty.value)
1659                     this.token.type = Node.StartEndTag;
1660
1661                 this.token.attributes = attributes;
1662                 this.lexsize = this.txtstart;
1663                 this.txtend = this.txtstart;
1664
1665                 /* swallow newline following start tag */
1666                 /* special check needed for CRLF sequence */
1667                 /* this doesn't apply to empty elements */
1668
1669                 if (expectsContent(this.token) ||
1670                     this.token.tag == configuration.tt.tagBr)
1671                 {
1672
1673                     c = this.in.readChar();
1674
1675                     if (c == '\r')
1676                     {
1677                         c = this.in.readChar();
1678
1679                         if (c != '\n')
1680                             this.in.ungetChar(c);
1681                     }
1682                     else if (c != '\n' && c != '\f')
1683                         this.in.ungetChar(c);
1684
1685                     this.waswhite = true;  /* to swallow leading whitespace */
1686                 }
1687                 else
1688                     this.waswhite = false;
1689
1690                 this.state = LEX_CONTENT;
1691
1692                 if (this.token.tag == null)
1693                     Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
1694                 else if (!this.configuration.XmlTags)
1695                 {
1696                     this.versions &= this.token.tag.versions;
1697
1698                     if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0)
1699                     {
1700                         if (!this.configuration.MakeClean && (this.token.tag == configuration.tt.tagNobr ||
1701                                                 this.token.tag == configuration.tt.tagWbr))
1702                             Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
1703                     }
1704
1705                     if (this.token.tag.chkattrs != null)
1706                     {
1707                         this.token.checkUniqueAttributes(this);
1708                         this.token.tag.chkattrs.check(this, this.token);
1709                     }
1710                     else
1711                         this.token.checkAttributes(this);
1712                 }
1713
1714                 return this.token;  /* return start tag */
1715
1716             case LEX_COMMENT:  /* seen <!-- so look for --> */
1717
1718                 if (c != '-')
1719                     continue;
1720
1721                 c = this.in.readChar();
1722                 addCharToLexer(c);
1723
1724                 if (c != '-')
1725                     continue;
1726
1727                 end_comment: while (true) {
1728                     c = this.in.readChar();
1729
1730                     if (c == '>')
1731                     {
1732                         if (badcomment != 0)
1733                             Report.warning(this, null, null, Report.MALFORMED_COMMENT);
1734
1735                         this.txtend = this.lexsize - 2; // AQ 8Jul2000
1736                         this.lexbuf[this.lexsize] = (byte)'\0';
1737                         this.state = LEX_CONTENT;
1738                         this.waswhite = false;
1739                         this.token = newNode(Node.CommentTag,
1740                                               this.lexbuf,
1741                                               this.txtstart,
1742                                               this.txtend);
1743
1744                         /* now look for a line break */
1745
1746                         c = this.in.readChar();
1747
1748                         if (c == '\r')
1749                         {
1750                             c = this.in.readChar();
1751
1752                             if (c != '\n')
1753                                 this.token.linebreak = true;
1754                         }
1755
1756                         if (c == '\n')
1757                             this.token.linebreak = true;
1758                         else
1759                             this.in.ungetChar(c);
1760
1761                         return this.token;
1762                     }
1763
1764                     /* note position of first such error in the comment */
1765                     if (badcomment == 0)
1766                     {
1767                         this.lines = this.in.curline;
1768                         this.columns = this.in.curcol - 3;
1769                     }
1770
1771                     badcomment++;
1772                     if (this.configuration.FixComments)
1773                         this.lexbuf[this.lexsize - 2] = (byte)'=';
1774
1775                     addCharToLexer(c);
1776
1777                     /* if '-' then look for '>' to end the comment */
1778                     if (c != '-')
1779                         break end_comment;
1780
1781                 }
1782                 /* otherwise continue to look for --> */
1783                 this.lexbuf[this.lexsize - 2] = (byte)'=';
1784                 continue;
1785
1786             case LEX_DOCTYPE:  /* seen <!d so look for '>' munging whitespace */
1787                 map = MAP((char)c);
1788
1789                 if ((map & WHITE) != 0)
1790                 {
1791                     if (this.waswhite)
1792                         this.lexsize -= 1;
1793
1794                     this.waswhite = true;
1795                 }
1796                 else
1797                     this.waswhite = false;
1798
1799                 if (c != '>')
1800                     continue;
1801
1802                 this.lexsize -= 1;
1803                 this.txtend = this.lexsize;
1804                 this.lexbuf[this.lexsize] = (byte)'\0';
1805                 this.state = LEX_CONTENT;
1806                 this.waswhite = false;
1807                 this.token = newNode(Node.DocTypeTag,
1808                                       this.lexbuf,
1809                                       this.txtstart,
1810                                       this.txtend);
1811                 /* make a note of the version named by the doctype */
1812                 this.doctype = findGivenVersion(this.token);
1813                 return this.token;
1814
1815             case LEX_PROCINSTR:  /* seen <? so look for '>' */
1816                 /* check for PHP preprocessor instructions <?php ... ?> */
1817
1818                 if  (this.lexsize - this.txtstart == 3)
1819                 {
1820                     if ((getString(this.lexbuf, this.txtstart, 3)).equals("php"))
1821                     {
1822                         this.state = LEX_PHP;
1823                         continue;
1824                     }
1825                 }
1826
1827                 if (this.configuration.XmlPIs)  /* insist on ?> as terminator */
1828                 {
1829                     if (c != '?')
1830                         continue;
1831
1832                     /* now look for '>' */
1833                     c = this.in.readChar();
1834
1835                     if (c == StreamIn.EndOfStream)
1836                     {
1837                         Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
1838                         this.in.ungetChar(c);
1839                         continue;
1840                     }
1841
1842                     addCharToLexer(c);
1843                 }
1844
1845                 if (c != '>')
1846                     continue;
1847
1848                 this.lexsize -= 1;
1849                 this.txtend = this.lexsize;
1850                 this.lexbuf[this.lexsize] = (byte)'\0';
1851                 this.state = LEX_CONTENT;
1852                 this.waswhite = false;
1853                 this.token = newNode(Node.ProcInsTag,
1854                                       this.lexbuf,
1855                                       this.txtstart,
1856                                       this.txtend);
1857                 return this.token;
1858
1859             case LEX_ASP:  /* seen <% so look for "%>" */
1860                 if (c != '%')
1861                     continue;
1862
1863                 /* now look for '>' */
1864                 c = this.in.readChar();
1865
1866
1867                 if (c != '>')
1868                 {
1869                     this.in.ungetChar(c);
1870                     continue;
1871                 }
1872
1873                 this.lexsize -= 1;
1874                 this.txtend = this.lexsize;
1875                 this.lexbuf[this.lexsize] = (byte)'\0';
1876                 this.state = LEX_CONTENT;
1877                 this.waswhite = false;
1878                 this.token = newNode(Node.AspTag,
1879                                       this.lexbuf,
1880                                       this.txtstart,
1881                                       this.txtend);
1882                 return this.token;
1883
1884             case LEX_JSTE:  /* seen <# so look for "#>" */
1885                 if (c != '#')
1886                     continue;
1887
1888                 /* now look for '>' */
1889                 c = this.in.readChar();
1890
1891
1892                 if (c != '>')
1893                 {
1894                     this.in.ungetChar(c);
1895                     continue;
1896                 }
1897
1898                 this.lexsize -= 1;
1899                 this.txtend = this.lexsize;
1900                 this.lexbuf[this.lexsize] = (byte)'\0';
1901                 this.state = LEX_CONTENT;
1902                 this.waswhite = false;
1903                 this.token = newNode(Node.JsteTag,
1904                                       this.lexbuf,
1905                                       this.txtstart,
1906                                       this.txtend);
1907                 return this.token;
1908
1909             case LEX_PHP: /* seen "<?php" so look for "?>" */
1910                 if (c != '?')
1911                     continue;
1912
1913                 /* now look for '>' */
1914                 c = this.in.readChar();
1915
1916                 if (c != '>')
1917                 {
1918                     this.in.ungetChar(c);
1919                     continue;
1920                 }
1921
1922                 this.lexsize -= 1;
1923                 this.txtend = this.lexsize;
1924                 this.lexbuf[this.lexsize] = (byte)'\0';
1925                 this.state = LEX_CONTENT;
1926                 this.waswhite = false;
1927                 this.token = newNode(Node.PhpTag,
1928                                       this.lexbuf,
1929                                       this.txtstart,
1930                                       this.txtend);
1931                 return this.token;
1932
1933             case LEX_SECTION: /* seen "<![" so look for "]>" */
1934                 if (c == '[')
1935                 {
1936                     if (this.lexsize == (this.txtstart + 6) &&
1937                         (getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
1938                     {
1939                         this.state = LEX_CDATA;
1940                         this.lexsize -= 6;
1941                         continue;
1942                     }
1943                 }
1944
1945                 if (c != ']')
1946                     continue;
1947
1948                 /* now look for '>' */
1949                 c = this.in.readChar();
1950
1951                 if (c != '>')
1952                 {
1953                     this.in.ungetChar(c);
1954                     continue;
1955                 }
1956
1957                 this.lexsize -= 1;
1958                 this.txtend = this.lexsize;
1959                 this.lexbuf[this.lexsize] = (byte)'\0';
1960                 this.state = LEX_CONTENT;
1961                 this.waswhite = false;
1962                 this.token = newNode(Node.SectionTag,
1963                                       this.lexbuf,
1964                                       this.txtstart,
1965                                       this.txtend);
1966                 return this.token;
1967
1968             case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
1969                 if (c != ']')
1970                     continue;
1971
1972                 /* now look for ']' */
1973                 c = this.in.readChar();
1974
1975                 if (c != ']')
1976                 {
1977                     this.in.ungetChar(c);
1978                     continue;
1979                 }
1980
1981                 /* now look for '>' */
1982                 c = this.in.readChar();
1983
1984                 if (c != '>')
1985                 {
1986                     this.in.ungetChar(c);
1987                     continue;
1988                 }
1989
1990                 this.lexsize -= 1;
1991                 this.txtend = this.lexsize;
1992                 this.lexbuf[this.lexsize] = (byte)'\0';
1993                 this.state = LEX_CONTENT;
1994                 this.waswhite = false;
1995                 this.token = newNode(Node.CDATATag,
1996                                       this.lexbuf,
1997                                       this.txtstart,
1998                                       this.txtend);
1999                 return this.token;
2000             }
2001         }
2002
2003         if (this.state == LEX_CONTENT)  /* text string */
2004         {
2005             this.txtend = this.lexsize;
2006
2007             if (this.txtend > this.txtstart)
2008             {
2009                 this.in.ungetChar(c);
2010
2011                 if (this.lexbuf[this.lexsize - 1] == (byte)' ')
2012                 {
2013                     this.lexsize -= 1;
2014                     this.txtend = this.lexsize;
2015                 }
2016
2017                 this.token = newNode(Node.TextNode,
2018                                       this.lexbuf,
2019                                       this.txtstart,
2020                                       this.txtend);
2021                 return this.token;
2022             }
2023         }
2024         else if (this.state == LEX_COMMENT) /* comment */
2025         {
2026             if (c == StreamIn.EndOfStream)
2027                 Report.warning(this, null, null, Report.MALFORMED_COMMENT);
2028
2029             this.txtend = this.lexsize;
2030             this.lexbuf[this.lexsize] = (byte)'\0';
2031             this.state = LEX_CONTENT;
2032             this.waswhite = false;
2033             this.token = newNode(Node.CommentTag,
2034                                   this.lexbuf,
2035                                   this.txtstart,
2036                                   this.txtend);
2037             return this.token;
2038         }
2039
2040         return null;
2041     }
2042
2043     /*
2044      parser for ASP within start tags
2045
2046      Some people use ASP for to customize attributes
2047      Tidy isn't really well suited to dealing with ASP
2048      This is a workaround for attributes, but won't
2049      deal with the case where the ASP is used to tailor
2050      the attribute value. Here is an example of a work
2051      around for using ASP in attribute values:
2052
2053       href="<%=rsSchool.Fields("ID").Value%>"
2054
2055      where the ASP that generates the attribute value
2056      is masked from Tidy by the quotemarks.
2057
2058     */
2059
2060     public Node parseAsp()
2061     {
2062         int c;
2063         Node asp = null;
2064
2065         this.txtstart = this.lexsize;
2066
2067         for (;;)
2068         {
2069             c = this.in.readChar();
2070             addCharToLexer(c);
2071
2072
2073             if (c != '%')
2074                 continue;
2075
2076             c = this.in.readChar();
2077             addCharToLexer(c);
2078
2079             if (c == '>')
2080                 break;
2081         }
2082
2083         this.lexsize -= 2;
2084         this.txtend = this.lexsize;
2085
2086         if (this.txtend > this.txtstart)
2087             asp = newNode(Node.AspTag,
2088                            this.lexbuf,
2089                            this.txtstart,
2090                            this.txtend);
2091
2092         this.txtstart = this.txtend;
2093         return asp;
2094     }
2095
2096     /*
2097      PHP is like ASP but is based upon XML
2098      processing instructions, e.g. <?php ... ?>
2099     */
2100     public Node parsePhp()
2101     {
2102         int c;
2103         Node php = null;
2104
2105         this.txtstart = this.lexsize;
2106
2107         for (;;)
2108         {
2109             c = this.in.readChar();
2110             addCharToLexer(c);
2111
2112
2113             if (c != '?')
2114                 continue;
2115
2116             c = this.in.readChar();
2117             addCharToLexer(c);
2118
2119             if (c == '>')
2120                 break;
2121         }
2122
2123         this.lexsize -= 2;
2124         this.txtend = this.lexsize;
2125
2126         if (this.txtend > this.txtstart)
2127             php = newNode(Node.PhpTag,
2128                            this.lexbuf,
2129                            this.txtstart,
2130                            this.txtend);
2131
2132         this.txtstart = this.txtend;
2133         return php;
2134     }
2135
2136     /* consumes the '>' terminating start tags */
2137     public String parseAttribute(MutableBoolean isempty, MutableObject asp,
2138                                  MutableObject php)
2139     {
2140         int start = 0;
2141         // int len = 0;   Removed by BUGFIX for 126265
2142         short map;
2143         String attr;
2144         int c = 0;
2145
2146         asp.setObject(null);  /* clear asp pointer */
2147         php.setObject(null);  /* clear php pointer */
2148         /* skip white space before the attribute */
2149
2150         for (;;)
2151         {
2152             c = this.in.readChar();
2153
2154             if (c == '/')
2155             {
2156                 c = this.in.readChar();
2157
2158                 if (c == '>')
2159                 {
2160                     isempty.value = true;
2161                     return null;
2162                 }
2163
2164                 this.in.ungetChar(c);
2165                 c = '/';
2166                 break;
2167             }
2168
2169             if (c == '>')
2170                 return null;
2171
2172             if (c =='<')
2173             {
2174                 c = this.in.readChar();
2175
2176                 if (c == '%')
2177                 {
2178                     asp.setObject(parseAsp());
2179                     return null;
2180                 }
2181                 else if (c == '?')
2182                 {
2183                     php.setObject(parsePhp());
2184                     return null;
2185                 }
2186
2187                 this.in.ungetChar(c);
2188                 Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
2189                 return null;
2190             }
2191
2192             if (c == '"' || c == '\'')
2193             {
2194                 Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
2195                 continue;
2196             }
2197
2198             if (c == StreamIn.EndOfStream)
2199             {
2200                 Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
2201                 this.in.ungetChar(c);
2202                 return null;
2203             }
2204
2205             map = MAP((char)c);
2206
2207             if ((map & WHITE) == 0)
2208                 break;
2209         }
2210
2211         start = this.lexsize;
2212
2213         for (;;)
2214         {
2215          /* but push back '=' for parseValue() */
2216             if (c == '=' || c == '>')
2217             {
2218                 this.in.ungetChar(c);
2219                 break;
2220             }
2221
2222             if (c == '<' || c == StreamIn.EndOfStream)
2223             {
2224                 this.in.ungetChar(c);
2225                 break;
2226             }
2227
2228             map = MAP((char)c);
2229
2230             if ((map & WHITE) != 0)
2231                 break;
2232
2233          /* what should be done about non-namechar characters? */
2234          /* currently these are incorporated into the attr name */
2235
2236             if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
2237                 c += (int)('a' - 'A');
2238
2239             //  ++len;    Removed by BUGFIX for 126265
2240             addCharToLexer(c);
2241
2242             c = this.in.readChar();
2243         }
2244
2245         // Following line added by GLP to fix BUG 126265.  This is a temporary comment
2246         // and should be removed when Tidy is fixed.
2247         int len = this.lexsize - start;
2248         attr = (len > 0 ? getString(this.lexbuf, start, len) : null);
2249         this.lexsize = start;
2250
2251         return attr;
2252     }
2253
2254     /*
2255      invoked when < is seen in place of attribute value
2256      but terminates on whitespace if not ASP, PHP or Tango
2257      this routine recognizes ' and " quoted strings
2258     */
2259     public int parseServerInstruction()
2260     {
2261         int c, map, delim = '"';
2262         boolean isrule = false;
2263
2264         c = this.in.readChar();
2265         addCharToLexer(c);
2266
2267         /* check for ASP, PHP or Tango */
2268         if (c == '%' || c == '?' || c == '@')
2269             isrule = true;
2270
2271         for (;;)
2272         {
2273             c = this.in.readChar();
2274
2275             if (c == StreamIn.EndOfStream)
2276                 break;
2277
2278             if (c == '>')
2279             {
2280                 if (isrule)
2281                     addCharToLexer(c);
2282                 else
2283                     this.in.ungetChar(c);
2284
2285                 break;
2286             }
2287
2288             /* if not recognized as ASP, PHP or Tango */
2289             /* then also finish value on whitespace */
2290             if (!isrule)
2291             {
2292                 map = MAP((char)c);
2293
2294                 if ((map & WHITE) != 0)
2295                     break;
2296             }
2297
2298             addCharToLexer(c);
2299
2300             if (c == '"')
2301             {
2302                 do
2303                 {
2304                     c = this.in.readChar();
2305                     addCharToLexer(c);
2306                 }
2307                 while (c != '"');
2308                 delim = '\'';
2309                 continue;
2310             }
2311
2312             if (c == '\'')
2313             {
2314                 do
2315                 {
2316                     c = this.in.readChar();
2317                     addCharToLexer(c);
2318                 }
2319                 while (c != '\'');
2320             }
2321         }
2322
2323         return delim;
2324     }
2325
2326     /* values start with "=" or " = " etc. */
2327     /* doesn't consume the ">" at end of start tag */
2328
2329     public String parseValue(String name, boolean foldCase,
2330                              MutableBoolean isempty, MutableInteger pdelim)
2331     {
2332         int len = 0;
2333         int start;
2334         short map;
2335         boolean seen_gt = false;
2336         boolean munge = true;
2337         int c = 0;
2338         int lastc, delim, quotewarning;
2339         String value;
2340
2341         delim = 0;
2342         pdelim.value = (int)'"';
2343
2344         /*
2345          Henry Zrepa reports that some folk are using the
2346          embed element with script attributes where newlines
2347          are significant and must be preserved
2348         */
2349         if (configuration.LiteralAttribs)
2350             munge = false;
2351
2352         /* skip white space before the '=' */
2353
2354         for (;;)
2355         {
2356             c = this.in.readChar();
2357
2358             if (c == StreamIn.EndOfStream)
2359             {
2360                 this.in.ungetChar(c);
2361                 break;
2362             }
2363
2364             map = MAP((char)c);
2365
2366             if ((map & WHITE) == 0)
2367                break;
2368         }
2369
2370     /*
2371       c should be '=' if there is a value
2372       other legal possibilities are white
2373       space, '/' and '>'
2374     */
2375
2376         if (c != '=')
2377         {
2378             this.in.ungetChar(c);
2379             return null;
2380         }
2381
2382      /* skip white space after '=' */
2383
2384         for (;;)
2385         {
2386             c = this.in.readChar();
2387
2388             if (c == StreamIn.EndOfStream)
2389             {
2390                 this.in.ungetChar(c);
2391                 break;
2392             }
2393
2394             map = MAP((char)c);
2395
2396             if ((map & WHITE) == 0)
2397                break;
2398         }
2399
2400      /* check for quote marks */
2401
2402         if (c == '"' || c == '\'')
2403             delim = c;
2404         else if (c == '<')
2405         {
2406             start = this.lexsize;
2407             addCharToLexer(c);
2408             pdelim.value = parseServerInstruction();
2409             len = this.lexsize - start;
2410             this.lexsize = start;
2411             return (len > 0 ? getString(this.lexbuf, start, len) : null);
2412         }
2413         else
2414             this.in.ungetChar(c);
2415
2416      /*
2417        and read the value string
2418        check for quote mark if needed
2419      */
2420
2421         quotewarning = 0;
2422         start = this.lexsize;
2423         c = '\0';
2424
2425         for (;;)
2426         {
2427             lastc = c;  /* track last character */
2428             c = this.in.readChar();
2429
2430             if (c == StreamIn.EndOfStream)
2431             {
2432                 Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
2433                 this.in.ungetChar(c);
2434                 break;
2435             }
2436
2437             if (delim == (char)0)
2438             {
2439                 if (c == '>')
2440                 {
2441                     this.in.ungetChar(c);
2442                     break;
2443                 }
2444
2445                 if (c == '"' || c == '\'')
2446                 {
2447                     Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
2448                     break;
2449                 }
2450
2451                 if (c == '<')
2452                 {
2453                     /* this.in.ungetChar(c); */
2454                     Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
2455                     /* break; */
2456                 }
2457
2458                 /*
2459                  For cases like <br clear=all/> need to avoid treating /> as
2460                  part of the attribute value, however care is needed to avoid
2461                  so treating <a href=http://www.acme.com/> in this way, which
2462                  would map the <a> tag to <a href="http://www.acme.com"/>
2463                 */
2464                 if (c == '/')
2465                 {
2466                     /* peek ahead in case of /> */
2467                     c = this.in.readChar();
2468
2469                     if (c == '>' &&
2470                         !AttributeTable.getDefaultAttributeTable().isUrl(name))
2471                     {
2472                         isempty.value = true;
2473                         this.in.ungetChar(c);
2474                         break;
2475                     }
2476
2477                     /* unget peeked char */
2478                     this.in.ungetChar(c);
2479                     c = '/';
2480                 }
2481             }
2482             else  /* delim is '\'' or '"' */
2483             {
2484                 if (c == delim)
2485                     break;
2486
2487                 /* treat CRLF, CR and LF as single line break */
2488
2489                 if (c == '\r')
2490                 {
2491                     c = this.in.readChar();
2492                     if (c != '\n')
2493                         this.in.ungetChar(c);
2494
2495                     c = '\n';
2496                 }
2497
2498                 if (c == '\n' || c == '<' || c == '>')
2499                     ++quotewarning;
2500
2501                 if (c == '>')
2502                     seen_gt = true;
2503             }
2504
2505             if (c == '&')
2506             {
2507                 addCharToLexer(c);
2508                 parseEntity((short)0);
2509                 continue;
2510             }
2511
2512             /*
2513              kludge for JavaScript attribute values
2514              with line continuations in string literals
2515             */
2516             if (c == '\\')
2517             {
2518                 c = this.in.readChar();
2519
2520                 if (c != '\n')
2521                 {
2522                     this.in.ungetChar(c);
2523                     c = '\\';
2524                 }
2525             }
2526
2527             map = MAP((char)c);
2528
2529             if ((map & WHITE) != 0)
2530             {
2531                 if (delim == (char)0)
2532                     break;
2533
2534                 if (munge)
2535                 {
2536                     c = ' ';
2537
2538                     if (lastc == ' ')
2539                         continue;
2540                 }
2541             }
2542             else if (foldCase && (map & UPPERCASE) != 0)
2543                 c += (int)('a' - 'A');
2544
2545             addCharToLexer(c);
2546         }
2547
2548         if (quotewarning > 10 && seen_gt && munge)
2549         {
2550             /*
2551                there is almost certainly a missing trailling quote mark
2552                as we have see too many newlines, < or > characters.
2553
2554                an exception is made for Javascript attributes and the
2555                javascript URL scheme which may legitimately include < and >
2556             */
2557             if (!AttributeTable.getDefaultAttributeTable().isScript(name) &&
2558                 !(AttributeTable.getDefaultAttributeTable().isUrl(name) &&
2559                   (getString(this.lexbuf, start, 11)).equals("javascript:")))
2560                     Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
2561         }
2562
2563         len = this.lexsize - start;
2564         this.lexsize = start;
2565
2566         if (len > 0 || delim != 0)
2567             value = getString(this.lexbuf, start, len);
2568         else
2569             value = null;
2570
2571         /* note delimiter if given */
2572         if (delim != 0)
2573             pdelim.value = delim;
2574         else
2575             pdelim.value = (int)'"';
2576
2577         return value;
2578     }
2579
2580     /* attr must be non-null */
2581     public static boolean isValidAttrName(String attr)
2582     {
2583         short map;
2584         char c;
2585         int i;
2586
2587         /* first character should be a letter */
2588         c = attr.charAt(0);
2589         map = MAP(c);
2590
2591         if (!((map & LETTER) != 0))
2592             return false;
2593
2594         /* remaining characters should be namechars */
2595         for( i = 1; i < attr.length(); i++)
2596         {
2597             c = attr.charAt(i);
2598             map = MAP(c);
2599
2600             if((map & NAMECHAR) != 0)
2601                 continue;
2602
2603             return false;
2604         }
2605
2606         return true;
2607     }
2608
2609     /* swallows closing '>' */
2610
2611     public AttVal parseAttrs(MutableBoolean isempty)
2612     {
2613         AttVal av, list;
2614         String attribute, value;
2615         MutableInteger delim = new MutableInteger();
2616         MutableObject asp = new MutableObject();
2617         MutableObject php = new MutableObject();
2618
2619         list = null;
2620
2621         for (; !endOfInput();)
2622         {
2623             attribute = parseAttribute(isempty, asp, php);
2624
2625             if (attribute == null)
2626             {
2627                 /* check if attributes are created by ASP markup */
2628                 if (asp.getObject() != null)
2629                 {
2630                     av = new AttVal(list, null, (Node)asp.getObject(), null,
2631                                     '\0', null, null );
2632                     list = av;
2633                     continue;
2634                 }
2635
2636                 /* check if attributes are created by PHP markup */
2637                 if (php.getObject() != null)
2638                 {
2639                     av = new AttVal(list, null, null, (Node)php.getObject(),
2640                                     '\0', null, null );
2641                     list = av;
2642                     continue;
2643                 }
2644
2645                 break;
2646             }
2647
2648             value = parseValue(attribute, false, isempty, delim);
2649
2650             if (attribute != null && isValidAttrName(attribute))
2651             {
2652                 av = new AttVal( list, null, null, null,
2653                                  delim.value, attribute, value );
2654                 av.dict =
2655                     AttributeTable.getDefaultAttributeTable().findAttribute(av);
2656                 list = av;
2657             }
2658             else
2659             {
2660                 av = new AttVal( null, null, null, null,
2661                                  0, attribute, value );
2662                 Report.attrError(this, this.token, value, Report.BAD_ATTRIBUTE_VALUE);
2663             }
2664         }
2665
2666         return list;
2667     }
2668
2669     /*
2670       push a copy of an inline node onto stack
2671       but don't push if implicit or OBJECT or APPLET
2672       (implicit tags are ones generated from the istack)
2673
2674       One issue arises with pushing inlines when
2675       the tag is already pushed. For instance:
2676
2677           <p><em>text
2678           <p><em>more text
2679
2680       Shouldn't be mapped to
2681
2682           <p><em>text</em></p>
2683           <p><em><em>more text</em></em>
2684     */
2685     public void pushInline( Node node )
2686     {
2687         IStack is;
2688
2689         if (node.implicit)
2690             return;
2691
2692         if (node.tag == null)
2693             return;
2694
2695         if ((node.tag.model & Dict.CM_INLINE) == 0 )
2696             return;
2697
2698         if ((node.tag.model & Dict.CM_OBJECT) != 0)
2699             return;
2700
2701         if (node.tag != configuration.tt.tagFont && isPushed(node))
2702             return;
2703
2704         // make sure there is enough space for the stack
2705         is = new IStack();
2706         is.tag = node.tag;
2707         is.element = node.element;
2708         if (node.attributes != null)
2709             is.attributes = cloneAttributes(node.attributes);
2710         this.istack.push( is );
2711     }
2712
2713     /* pop inline stack */
2714     public void popInline( Node node )
2715     {
2716         AttVal av;
2717         IStack is;
2718
2719         if (node != null) {
2720
2721             if (node.tag == null)
2722                 return;
2723
2724             if ((node.tag.model & Dict.CM_INLINE) == 0)
2725                 return;
2726
2727             if ((node.tag.model & Dict.CM_OBJECT) != 0)
2728                 return;
2729
2730             // if node is </a> then pop until we find an <a>
2731             if (node.tag == configuration.tt.tagA) {
2732
2733                 while (this.istack.size() > 0) {
2734                     is = (IStack)this.istack.pop();
2735                     if (is.tag == configuration.tt.tagA) {
2736                         break;
2737                     }
2738                 }
2739
2740                 if (this.insert >= this.istack.size())
2741                     this.insert = -1;
2742                 return;
2743             }
2744         }
2745
2746         if (this.istack.size() > 0) {
2747             is = (IStack)this.istack.pop();
2748             if (this.insert >= this.istack.size())
2749                 this.insert = -1;
2750         }
2751     }
2752
2753     public boolean isPushed( Node node )
2754     {
2755         int i;
2756         IStack is;
2757
2758         for (i = this.istack.size() - 1; i >= 0; --i) {
2759             is = (IStack)this.istack.elementAt(i);
2760             if (is.tag == node.tag)
2761                 return true;
2762         }
2763
2764         return false;
2765     }
2766
2767     /*
2768       This has the effect of inserting "missing" inline
2769       elements around the contents of blocklevel elements
2770       such as P, TD, TH, DIV, PRE etc. This procedure is
      called at the start of ParseBlock when the inline
      stack is not empty, as will be the case in:
2773
2774         <i><h1>italic heading</h1></i>
2775
2776       which is then treated as equivalent to
2777
2778         <h1><i>italic heading</i></h1>
2779
2780       This is implemented by setting the lexer into a mode
2781       where it gets tokens from the inline stack rather than
2782       from the input stream.
2783     */
2784     public int inlineDup( Node node )
2785     {
2786         int n;
2787
2788         n = this.istack.size() - this.istackbase;
2789         if ( n > 0 ) {
2790             this.insert = this.istackbase;
2791             this.inode = node;
2792         }
2793
2794         return n;
2795     }
2796
2797     public Node insertedToken()
2798     {
2799         Node node;
2800         IStack is;
2801         int n;
2802
2803         // this will only be null if inode != null
2804         if (this.insert == -1) {
2805             node = this.inode;
2806             this.inode = null;
2807             return node;
2808         }
2809
2810         // is this is the "latest" node then update
2811         // the position, otherwise use current values
2812
2813         if (this.inode == null) {
2814             this.lines = this.in.curline;
2815             this.columns = this.in.curcol;
2816         }
2817
2818         node = newNode(Node.StartTag,
2819                         this.lexbuf,
2820                         this.txtstart,
2821                         this.txtend);   // GLP:  Bugfix 126261.  Remove when this change
2822                                         //       is fixed in istack.c in the original Tidy
2823         node.implicit = true;
2824         is = (IStack)this.istack.elementAt( this.insert );
2825         node.element = is.element;
2826         node.tag = is.tag;
2827         if (is.attributes != null)
2828             node.attributes = cloneAttributes(is.attributes);
2829
2830         // advance lexer to next item on the stack
2831         n = this.insert;
2832
2833         // and recover state if we have reached the end
2834         if (++n < this.istack.size() ) {
2835             this.insert = n;
2836         } else {
2837             this.insert = -1;
2838         }
2839
2840         return node;
2841     }
2842
2843     /* AQ: Try this for speed optimization */
2844     public static int wstrcasecmp(String s1, String s2)
2845     {
2846         return (s1.equalsIgnoreCase(s2) ? 0 : 1);
2847     }
2848
2849     public static int wstrcaselexcmp(String s1, String s2)
2850     {
2851         char c;
2852         int i = 0;
2853
2854         while ( i < s1.length() && i < s2.length() ) {
2855             c = s1.charAt(i);
2856             if ( toLower(c) != toLower( s2.charAt(i) ) ) {
2857                 break;
2858             }
2859             i += 1;
2860         }
2861         if ( i == s1.length() && i == s2.length() ) {
2862             return 0;
2863         } else if ( i == s1.length() ) {
2864             return -1;
2865         } else if ( i == s2.length() ) {
2866             return 1;
2867         } else {
2868             return ( s1.charAt(i) > s2.charAt(i) ? 1 : -1 );
2869         }
2870     }
2871
2872     public static boolean wsubstr(String s1, String s2)
2873     {
2874         int i;
2875         int len1 = s1.length();
2876         int len2 = s2.length();
2877
2878         for (i = 0; i <= len1 - len2; ++i)
2879         {
2880             if (s2.equalsIgnoreCase(s1.substring(i)))
2881                 return true;
2882         }
2883
2884         return false;
2885     }
2886
2887     public boolean canPrune(Node element)
2888     {
2889         if (element.type == Node.TextNode)
2890             return true;
2891
2892         if (element.content != null)
2893             return false;
2894
2895         if (element.tag == configuration.tt.tagA && element.attributes != null)
2896             return false;
2897
2898         if (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas)
2899             return false;
2900
2901         if (element.tag == null)
2902             return false;
2903
2904         if ((element.tag.model & Dict.CM_ROW) != 0)
2905             return false;
2906
2907         if (element.tag == configuration.tt.tagApplet)
2908             return false;
2909
2910         if (element.tag == configuration.tt.tagObject)
2911             return false;
2912
2913         if (element.attributes != null &&
2914             (element.getAttrByName("id") != null ||
2915                element.getAttrByName("name") != null) )
2916             return false;
2917
2918         return true;
2919     }
2920
2921     /* duplicate name attribute as an id */
2922     public void fixId(Node node)
2923     {
2924         AttVal name = node.getAttrByName("name");
2925         AttVal id = node.getAttrByName("id");
2926
2927         if (name != null)
2928         {
2929             if (id != null)
2930             {
2931                 if (!id.value.equals(name.value))
2932                     Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH);
2933             }
2934             else if (this.configuration.XmlOut)
2935                 node.addAttribute("id", name.value);
2936         }
2937     }
2938
2939     /*
2940      defer duplicates when entering a table or other
2941      element where the inlines shouldn't be duplicated
2942     */
2943     public void deferDup()
2944     {
2945         this.insert = -1;
2946         this.inode = null;
2947     }
2948
2949     /* Private methods and fields */
2950
2951     /* lexer char types */
2952     private static final short DIGIT       = 1;
2953     private static final short LETTER      = 2;
2954     private static final short NAMECHAR    = 4;
2955     private static final short WHITE       = 8;
2956     private static final short NEWLINE     = 16;
2957     private static final short LOWERCASE   = 32;
2958     private static final short UPPERCASE   = 64;
2959
2960     /* lexer GetToken states */
2961
2962     private static final short LEX_CONTENT     = 0;
2963     private static final short LEX_GT          = 1;
2964     private static final short LEX_ENDTAG      = 2;
2965     private static final short LEX_STARTTAG    = 3;
2966     private static final short LEX_COMMENT     = 4;
2967     private static final short LEX_DOCTYPE     = 5;
2968     private static final short LEX_PROCINSTR   = 6;
2969     private static final short LEX_ENDCOMMENT  = 7;
2970     private static final short LEX_CDATA       = 8;
2971     private static final short LEX_SECTION     = 9;
2972     private static final short LEX_ASP         = 10;
2973     private static final short LEX_JSTE        = 11;
2974     private static final short LEX_PHP         = 12;
2975
2976     /* used to classify chars for lexical purposes */
2977     private static short[] lexmap = new short[128];
2978
2979     private static void mapStr(String str, short code)
2980     {
2981         int j;
2982
2983         for ( int i = 0; i < str.length(); i++ ) {
2984             j = (int)str.charAt(i);
2985             lexmap[j] |= code;
2986         }
2987     }
2988
2989     static {
2990         mapStr("\r\n\f", (short)(NEWLINE|WHITE));
2991         mapStr(" \t", WHITE);
2992         mapStr("-.:_", NAMECHAR);
2993         mapStr("0123456789", (short)(DIGIT|NAMECHAR));
2994         mapStr("abcdefghijklmnopqrstuvwxyz", (short)(LOWERCASE|LETTER|NAMECHAR));
2995         mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short)(UPPERCASE|LETTER|NAMECHAR));
2996     }
2997
2998     private static short MAP( char c )
2999     {
3000         return ((int)c < 128 ? lexmap[(int)c] : 0);
3001     }
3002
3003     private static boolean isWhite(char c)
3004     {
3005         short m = MAP(c);
3006
3007         return (m & WHITE) != 0;
3008     }
3009
3010     private static boolean isDigit(char c)
3011     {
3012         short m;
3013
3014         m = MAP(c);
3015
3016         return (m & DIGIT) != 0;
3017     }
3018
3019     private static boolean isLetter(char c)
3020     {
3021         short m;
3022
3023         m = MAP(c);
3024
3025         return (m & LETTER) != 0;
3026     }
3027
3028     private static char toLower(char c)
3029     {
3030         short m = MAP(c);
3031
3032         if ((m & UPPERCASE) != 0)
3033             c = (char)( (int)c + (int)'a' - (int)'A' );
3034
3035         return c;
3036     }
3037
3038     private static char toUpper(char c)
3039     {
3040         short m = MAP(c);
3041
3042         if ((m & LOWERCASE) != 0)
3043             c = (char)( (int)c + (int)'A' - (int)'a' );
3044
3045         return c;
3046     }
3047
3048     public static char foldCase(char c, boolean tocaps, boolean xmlTags)
3049     {
3050         short m;
3051
3052         if (!xmlTags)
3053         {
3054             m = MAP(c);
3055
3056             if (tocaps)
3057             {
3058                 if ((m & LOWERCASE) != 0)
3059                     c = (char)( (int)c + (int)'A' - (int)'a' );
3060             }
3061             else /* force to lower case */
3062             {
3063                 if ((m & UPPERCASE) != 0)
3064                     c = (char)( (int)c + (int)'a' - (int)'A' );
3065             }
3066         }
3067
3068         return c;
3069     }
3070
3071
3072     private static class W3CVersionInfo
3073     {
3074         String name;
3075         String voyagerName;
3076         String profile;
3077         short code;
3078
3079         public W3CVersionInfo( String name,
3080                                String voyagerName,
3081                                String profile,
3082                                short code )
3083         {
3084             this.name = name;
3085             this.voyagerName = voyagerName;
3086             this.profile = profile;
3087             this.code = code;
3088         }
3089     }
3090
3091     /* the 3 URIs  for the XHTML 1.0 DTDs */
3092     private static final String voyager_loose    = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
3093     private static final String voyager_strict   = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
3094     private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
3095
3096     private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
3097
3098     private static Lexer.W3CVersionInfo[] W3CVersion =
3099     {
3100         new W3CVersionInfo("HTML 4.01",
3101                            "XHTML 1.0 Strict",
3102                            voyager_strict,
3103                            Dict.VERS_HTML40_STRICT),
3104         new W3CVersionInfo("HTML 4.01 Transitional",
3105                            "XHTML 1.0 Transitional",
3106                            voyager_loose,
3107                            Dict.VERS_HTML40_LOOSE),
3108         new W3CVersionInfo("HTML 4.01 Frameset",
3109                            "XHTML 1.0 Frameset",
3110                            voyager_frameset,
3111                            Dict.VERS_FRAMES),
3112         new W3CVersionInfo("HTML 4.0",
3113                            "XHTML 1.0 Strict",
3114                            voyager_strict,
3115                            Dict.VERS_HTML40_STRICT),
3116         new W3CVersionInfo("HTML 4.0 Transitional",
3117                            "XHTML 1.0 Transitional",
3118                            voyager_loose,
3119                            Dict.VERS_HTML40_LOOSE),
3120         new W3CVersionInfo("HTML 4.0 Frameset",
3121                            "XHTML 1.0 Frameset",
3122                            voyager_frameset,
3123                            Dict.VERS_FRAMES),
3124         new W3CVersionInfo("HTML 3.2",
3125                            "XHTML 1.0 Transitional",
3126                            voyager_loose,
3127                            Dict.VERS_HTML32),
3128         new W3CVersionInfo("HTML 2.0",
3129                            "XHTML 1.0 Strict",
3130                            voyager_strict,
3131                            Dict.VERS_HTML20)
3132     };
3133
3134 }