(translation to Java)
* @version 1.0, 1999/05/22
* @version 1.0.1, 1999/05/29
* @version 1.1, 1999/06/18 Java Bean
* @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
* @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
* @version 1.4, 1999/09/04 DOM support
* @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
* @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
* @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
* @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
* @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
* @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
* @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
*/
/*
Given a file stream fp it returns a sequence of tokens.
GetToken(fp) gets the next token
UngetToken(fp) provides one level undo
The tags include an attribute list:
- linked list of attribute/value nodes
- each node has 2 null-terminated strings.
- entities are replaced in attribute values
white space is compacted if not in preformatted mode
If not in preformatted mode then leading white space
is discarded and subsequent white space sequences
compacted to single space chars.
If XmlTags is no then tag names are folded to lower
case (see parseTagName) and attribute names to lower case.
Not yet done:
- Doctype subset and marked sections
*/
import java.io.PrintWriter;
import java.util.Stack;
import java.util.Vector;
import org.eclipse.core.resources.IFile;
import sun.security.krb5.internal.av;
public class Lexer {
/* Eclipse file resource handed to the constructor; exposed through getIFile() */
private IFile iFile;
public StreamIn in; /* file stream */
public PrintWriter errout; /* error output stream */
public short badAccess; /* for accessibility errors */
public short badLayout; /* for bad style errors */
public short badChars; /* for bad char encodings */
public short badForm; /* for mismatched/mispositioned form tags */
public short warnings; /* count of warnings in this document */
public short errors; /* count of errors */
public int lines; /* lines seen */
public int columns; /* at start of current token */
public boolean waswhite; /* used to collapse contiguous white space */
public boolean pushed; /* true after token has been pushed back */
public boolean insertspace; /* when space is moved after end tag */
public boolean excludeBlocks; /* Netscape compatibility */
public boolean exiled; /* true if moved out of table */
public boolean isvoyager; /* true if xmlns attribute on html element */
public short versions; /* bit vector of HTML versions */
public int doctype; /* version as given by doctype (if any) */
public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
public int txtstart; /* start of current node */
public int txtend; /* end of current node */
public short state; /* state of lexer's finite state machine */
/* most recently produced token; getToken() hands it back again after ungetToken() */
public Node token;
/*
lexer character buffer
parse tree nodes span onto this buffer
which contains the concatenated text
contents of all of the elements.
lexsize must be reset for each file.
*/
public byte[] lexbuf; /* byte buffer of UTF-8 chars */
public int lexlength; /* allocated */
public int lexsize; /* used */
/* Inline stack for compatibility with Mosaic */
public Node inode; /* for deferring text node */
public int insert; /* for inferring inline tags */
public Stack istack;
public int istackbase; /* start of frame */
public Style styles; /* used for cleaning up presentation markup */
/* option settings consulted by the lexer (e.g. XmlTags, XmlOut, docTypeMode, wraplen) */
public Configuration configuration;
protected int seenBodyEndTag; /* used by parser */
/*
every Node created through newNode()/cloneNode() is recorded here so that
updateNodeTextArrays() can re-point nodes when lexbuf is reallocated
*/
private Vector nodeList;
/**
 * Creates a lexer that reads from the given stream.
 *
 * All counters start at zero, the position starts at line 1 / column 1,
 * the state machine starts in LEX_CONTENT and the character buffer is
 * left unallocated until the first byte is added.
 *
 * @param iFile         file resource later returned by getIFile()
 * @param in            input stream the tokens are read from
 * @param configuration option settings used while lexing
 */
public Lexer(IFile iFile, StreamIn in, Configuration configuration)
{
    /* collaborators supplied by the caller */
    this.iFile = iFile;
    this.in = in;
    this.configuration = configuration;

    /* input position and state machine */
    lines = 1;
    columns = 1;
    state = LEX_CONTENT;

    /* diagnostics counters */
    badAccess = 0;
    badLayout = 0;
    badChars = 0;
    badForm = 0;
    warnings = 0;
    errors = 0;

    /* token bookkeeping flags */
    waswhite = false;
    pushed = false;
    insertspace = false;
    exiled = false;
    isvoyager = false;
    token = null;

    /* document version tracking */
    versions = Dict.VERS_EVERYTHING;
    doctype = Dict.VERS_UNKNOWN;
    badDoctype = false;

    /* character buffer: allocated lazily by addByte() */
    txtstart = 0;
    txtend = 0;
    lexbuf = null;
    lexlength = 0;
    lexsize = 0;

    /* inline stack */
    inode = null;
    insert = -1;
    istack = new Stack();
    istackbase = 0;

    styles = null;
    seenBodyEndTag = 0;
    nodeList = new Vector();
}
/**
 * @return the file resource this lexer was constructed with
 */
public IFile getIFile()
{
    return this.iFile;
}
/**
 * Creates an empty node and records it in the lexer's node list so its
 * text array can be re-pointed if the lexer buffer is reallocated.
 *
 * @return the newly created node
 */
public Node newNode()
{
    final Node created = new Node();
    this.nodeList.addElement(created);
    return created;
}
/**
 * Creates a node of the given type spanning textarray[start..end) and
 * records it in the lexer's node list.
 *
 * @param type      node type constant (e.g. Node.TextNode)
 * @param textarray buffer holding the node's text
 * @param start     offset of the first byte of the node's text
 * @param end       offset just past the node's text
 * @return the newly created node
 */
public Node newNode(short type, byte[] textarray, int start, int end)
{
    final Node created = new Node(type, textarray, start, end);
    this.nodeList.addElement(created);
    return created;
}
/**
 * Creates an element node of the given type and name, passing the
 * configured tag table (configuration.tt) to the Node constructor, and
 * records it in the lexer's node list.
 *
 * @param type      node type constant (e.g. Node.StartTag)
 * @param textarray buffer holding the node's text
 * @param start     offset of the first byte of the node's text
 * @param end       offset just past the node's text
 * @param element   element name
 * @return the newly created node
 */
public Node newNode(short type, byte[] textarray, int start, int end, String element)
{
    final Node created = new Node(type, textarray, start, end, element, this.configuration.tt);
    this.nodeList.addElement(created);
    return created;
}
/**
 * Clones a node and registers the copy, plus any ASP/PHP nodes hanging
 * off the copy's attributes, in the lexer's node list.
 *
 * @param node node to clone
 * @return the registered copy
 */
public Node cloneNode(Node node)
{
    final Node copy = (Node) node.clone();
    this.nodeList.addElement(copy);

    AttVal attr = copy.attributes;
    while (attr != null)
    {
        if (attr.asp != null)
        {
            this.nodeList.addElement(attr.asp);
        }
        if (attr.php != null)
        {
            this.nodeList.addElement(attr.php);
        }
        attr = attr.next;
    }
    return copy;
}
/**
 * Clones an attribute list and registers any ASP/PHP nodes found on the
 * cloned attributes in the lexer's node list.
 *
 * @param attrs head of the attribute list to clone (must not be null)
 * @return head of the cloned list
 */
public AttVal cloneAttributes(AttVal attrs)
{
    final AttVal copyHead = (AttVal) attrs.clone();

    AttVal attr = copyHead;
    while (attr != null)
    {
        if (attr.asp != null)
        {
            this.nodeList.addElement(attr.asp);
        }
        if (attr.php != null)
        {
            this.nodeList.addElement(attr.php);
        }
        attr = attr.next;
    }
    return copyHead;
}
/**
 * Re-points every registered node that still refers to the old buffer so
 * that it refers to the new one. Arrays are compared by identity.
 *
 * @param oldtextarray buffer being replaced
 * @param newtextarray buffer that replaces it
 */
protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
{
    final int count = this.nodeList.size();
    for (int index = 0; index < count; index++)
    {
        final Node registered = (Node) this.nodeList.elementAt(index);
        if (registered.textarray == oldtextarray)
        {
            registered.textarray = newtextarray;
        }
    }
}
/**
 * Creates a node whose text is a single newline appended to the lexer
 * buffer. Used for creating preformatted text from Word2000.
 *
 * The node's text array is assigned only after the newline has been
 * added: adding a byte may allocate or reallocate lexbuf, and addByte()
 * does not re-point nodes when the previous buffer was null, so capturing
 * lexbuf beforehand could leave the node with a null text array.
 *
 * @return the new node spanning the appended newline
 */
public Node newLineNode()
{
    Node node = newNode();
    node.start = this.lexsize;
    addCharToLexer((int) '\n');
    node.end = this.lexsize;
    node.textarray = this.lexbuf;
    return node;
}
/**
 * Encodes a string as UTF-8 bytes.
 *
 * Conversion to/from UTF-8 should always be possible, so an encoding
 * exception is converted to an Error to avoid adding throws declarations
 * to lots of methods.
 *
 * @param str string to encode
 * @return the UTF-8 encoding of str
 */
public static byte[] getBytes(String str)
{
    byte[] encoded;
    try
    {
        encoded = str.getBytes("UTF8");
    }
    catch (java.io.UnsupportedEncodingException e)
    {
        throw new Error("string to UTF-8 conversion failed: " + e.getMessage());
    }
    return encoded;
}
/**
 * Decodes a slice of a UTF-8 byte buffer into a string.
 *
 * An unsupported-encoding exception is converted to an Error, since
 * UTF-8 is expected to be always available.
 *
 * @param bytes  buffer holding UTF-8 encoded text
 * @param offset index of the first byte to decode
 * @param length number of bytes to decode
 * @return the decoded string
 */
public static String getString(byte[] bytes, int offset, int length)
{
    String decoded;
    try
    {
        decoded = new String(bytes, offset, length, "UTF8");
    }
    catch (java.io.UnsupportedEncodingException e)
    {
        throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
    }
    return decoded;
}
/**
 * @return whatever the input stream's isEndOfStream() reports
 */
public boolean endOfInput()
{
    final StreamIn source = this.in;
    return source.isEndOfStream();
}
/**
 * Appends one byte to the lexer buffer, growing the buffer when needed.
 *
 * The buffer starts at 8192 bytes and doubles until it has room for the
 * new byte plus a trailing zero byte. When an existing buffer is
 * replaced, its contents are copied and every registered node that
 * referred to it is re-pointed to the new buffer.
 *
 * @param c byte value to append (only the low 8 bits are stored)
 */
public void addByte(int c)
{
    final int required = this.lexsize + 1;
    if (required >= this.lexlength)
    {
        int capacity = this.lexlength;
        do
        {
            capacity = (capacity == 0) ? 8192 : capacity * 2;
        }
        while (required >= capacity);
        this.lexlength = capacity;

        final byte[] previous = this.lexbuf;
        final byte[] grown = new byte[capacity];
        this.lexbuf = grown;
        if (previous != null)
        {
            System.arraycopy(previous, 0, grown, 0, previous.length);
            updateNodeTextArrays(previous, grown);
        }
    }

    this.lexbuf[this.lexsize] = (byte) c;
    this.lexsize++;
    this.lexbuf[this.lexsize] = (byte) '\0'; /* debug */
}
/**
 * Overwrites the most recently added byte of the lexer buffer.
 * Does nothing when the buffer is empty.
 *
 * @param c replacement byte
 */
public void changeChar(byte c)
{
    if (this.lexsize <= 0)
    {
        return;
    }
    this.lexbuf[this.lexsize - 1] = c;
}
/**
 * Stores character c in the lexer buffer as a UTF-8 encoded byte
 * sequence.
 *
 * Values below 128 are stored as one byte; larger values use a lead byte
 * followed by 1 to 4 continuation bytes of six payload bits each
 * (2 bytes up to 0x7FF, 3 up to 0xFFFF, 4 up to 0x1FFFFF, otherwise 5).
 *
 * @param c character code to store
 */
public void addCharToLexer(int c)
{
    if (c < 128)
    {
        addByte(c);
        return;
    }

    /* pick the lead-byte marker and the shift that isolates its payload */
    int lead;
    int shift;
    if (c <= 0x7FF)
    {
        lead = 0xC0;
        shift = 6;
    }
    else if (c <= 0xFFFF)
    {
        lead = 0xE0;
        shift = 12;
    }
    else if (c <= 0x1FFFFF)
    {
        lead = 0xF0;
        shift = 18;
    }
    else
    {
        lead = 0xF8;
        shift = 24;
    }

    addByte(lead | (c >> shift));

    /* remaining six-bit groups, most significant first */
    for (shift -= 6; shift >= 0; shift -= 6)
    {
        addByte(0x80 | ((c >> shift) & 0x3F));
    }
}
/**
 * Appends every char of str to the lexer buffer via addCharToLexer().
 *
 * @param str text to append
 */
public void addStringToLexer(String str)
{
    final int count = str.length();
    int index = 0;
    while (index < count)
    {
        addCharToLexer((int) str.charAt(index));
        index++;
    }
}
/*
Parses an entity reference whose leading '&' has just been added to the
lexer buffer, and replaces it in the buffer with the character it names.

No longer attempts to insert a missing ';' for unknown
entities unless one was present already, since this
gives unexpected results: an unknown name such as "&bar" (for instance
inside a URL query string like "x.htm?foo&bar") keeps its text as read
and only gets a ';' appended when the input actually had one.
My thanks for Maurice Buxton for spotting this.

mode is the getToken() mode; when its Preformatted bit is set a
non-breaking space (160) is stored as a plain space.
*/
public void parseEntity(short mode)
{
short map;
int start;
boolean first = true;
boolean semicolon = false;
boolean numeric = false;
int c, ch, startcol;
String str;
start = this.lexsize - 1; /* to start at "&" */
startcol = this.in.curcol - 1;
/* collect the entity name (or number) into the buffer after the '&' */
while (true)
{
c = this.in.readChar();
if (c == StreamIn.EndOfStream) break;
if (c == ';')
{
/* terminator is consumed but not stored */
semicolon = true;
break;
}
if (first && c == '#')
{
/* '#' directly after '&' marks a numeric entity */
addCharToLexer(c);
first = false;
numeric = true;
continue;
}
first = false;
map = MAP((char)c);
/* AQ: Added flag for numeric entities so that numeric entities
with missing semi-colons are recognized: a numeric entity accepts
only 'x' and digits, so it ends at the first other character.
*/
if (numeric && ((c == 'x') || ((map & DIGIT) != 0)))
{
addCharToLexer(c);
continue;
}
if (!numeric && ((map & NAMECHAR) != 0))
{
addCharToLexer(c);
continue;
}
/* otherwise put it back */
this.in.ungetChar(c);
break;
}
/* look up the collected text, which still includes the leading '&' */
str = getString( this.lexbuf, start, this.lexsize - start );
ch = EntityTable.getDefaultEntityTable().entityCode( str );
/* deal with unrecognized entities */
if (ch <= 0)
{
/* set error position just before offending character */
this.lines = this.in.curline;
this.columns = startcol;
if (this.lexsize > start +1 )
{
Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
/* restore the ';' only if the input had one */
if (semicolon)
addCharToLexer(';');
}
else /* naked & */
{
Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
}
}
else
{
if (c != ';') /* issue warning if not terminated by ';' */
{
/* set error position just before offending character */
this.lines = this.in.curline;
this.columns = startcol;
Report.entityError(this, Report.MISSING_SEMICOLON, str, c);
}
/* rewind to the '&' and store the decoded character in its place */
this.lexsize = start;
if (ch == 160 && (mode & Preformatted) != 0)
ch = ' ';
addCharToLexer(ch);
/* a decoded '&' is stored as "&amp;" when QuoteAmpersand is off */
if (ch == '&' && !this.configuration.QuoteAmpersand)
{
addCharToLexer('a');
addCharToLexer('m');
addCharToLexer('p');
addCharToLexer(';');
}
}
}
/**
 * Reads the rest of a tag name into the lexer buffer.
 *
 * The first character of the name is expected to be in the buffer
 * already at txtstart. Unless XmlTags is set, upper case characters are
 * folded to lower case. Reading stops at end of stream or at the first
 * character that is not a name character; txtend is set to the end of
 * the name.
 *
 * @return the character that terminated the name (the end-of-stream
 *         value cast to char when input ran out)
 */
public char parseTagName()
{
    final boolean foldCase = !this.configuration.XmlTags;
    final int toLower = (int) 'a' - (int) 'A';

    /* fold case of first char in buffer */
    int ch = this.lexbuf[this.txtstart];
    short kind = MAP((char) ch);
    if (foldCase && (kind & UPPERCASE) != 0)
    {
        ch += toLower;
        this.lexbuf[this.txtstart] = (byte) ch;
    }

    for (;;)
    {
        ch = this.in.readChar();
        if (ch == StreamIn.EndOfStream)
        {
            break;
        }

        kind = MAP((char) ch);
        if ((kind & NAMECHAR) == 0)
        {
            break;
        }

        /* fold case of subsequent chars */
        if (foldCase && (kind & UPPERCASE) != 0)
        {
            ch += toLower;
        }
        addCharToLexer(ch);
    }

    this.txtend = this.lexsize;
    return (char) ch;
}
/**
 * Appends the chars of a literal string to the lexer buffer, one
 * addCharToLexer() call per char.
 *
 * @param str text to append
 */
public void addStringLiteral(String str)
{
    final char[] chars = str.toCharArray();
    for (int k = 0; k < chars.length; k++)
    {
        addCharToLexer((int) chars[k]);
    }
}
/**
 * Chooses what version to use for a new doctype.
 *
 * Candidates are tried in a fixed order (HTML 2.0, HTML 3.2, HTML 4.0
 * strict, HTML 4.0 loose, frameset) and the first one whose bit is still
 * set in the versions bit vector is returned.
 *
 * @return the chosen version code, or Dict.VERS_UNKNOWN if none matches
 */
public short HTMLVersion()
{
    final short[] preference = {
        Dict.VERS_HTML20,
        Dict.VERS_HTML32,
        Dict.VERS_HTML40_STRICT,
        Dict.VERS_HTML40_LOOSE,
        Dict.VERS_FRAMES
    };

    final short seen = this.versions;
    for (int k = 0; k < preference.length; k++)
    {
        if ((seen & preference[k]) != 0)
        {
            return preference[k];
        }
    }
    return Dict.VERS_UNKNOWN;
}
/**
 * Looks up the name of the version reported by apparentVersion() in the
 * W3CVersion table.
 *
 * @return the entry's voyagerName when isvoyager is set, otherwise its
 *         name; null if no table entry has the apparent version's code
 */
public String HTMLVersionName()
{
    final short guessed = apparentVersion();

    int index = 0;
    while (index < W3CVersion.length && guessed != W3CVersion[index].code)
    {
        ++index;
    }
    if (index >= W3CVersion.length)
    {
        return null;
    }

    if (this.isvoyager)
    {
        return W3CVersion[index].voyagerName;
    }
    return W3CVersion[index].name;
}
/**
 * Adds a "generator" meta element for Tidy at the start of the head.
 *
 * Nothing is added when the document has no head, or when the head
 * already holds a meta element whose name is "generator" and whose
 * content starts with "HTML Tidy" (both compared ignoring case).
 *
 * @param root root of the document tree
 * @return true if a meta element was inserted, false otherwise
 */
public boolean addGenerator(Node root)
{
    final Node head = root.findHEAD(configuration.tt);
    if (head == null)
    {
        return false;
    }

    for (Node child = head.content; child != null; child = child.next)
    {
        if (child.tag != configuration.tt.tagMeta)
        {
            continue;
        }

        AttVal attval = child.getAttrByName("name");
        if (attval == null || attval.value == null ||
            Lexer.wstrcasecmp(attval.value, "generator") != 0)
        {
            continue;
        }

        attval = child.getAttrByName("content");
        if (attval != null && attval.value != null &&
            attval.value.length() >= 9 &&
            Lexer.wstrcasecmp(attval.value.substring(0, 9), "HTML Tidy") == 0)
        {
            /* already tagged by Tidy */
            return false;
        }
    }

    final Node meta = this.inferredTag("meta");
    meta.addAttribute("content", "HTML Tidy, see www.w3.org");
    meta.addAttribute("name", "generator");
    Node.insertNodeAtStart(head, meta);
    return true;
}
/**
 * Returns true if substring s occurs in p (ignoring case) and that
 * occurrence isn't spelled exactly like s, i.e. isn't all in upper case
 * for the upper case keywords it is used with. Used to check the case of
 * SYSTEM, PUBLIC, DTD and EN.
 *
 * Only the first occurrence is examined.
 *
 * @param s   keyword to look for
 * @param p   text to search
 * @param len how many chars of p to check; clamped to p.length() so a
 *            caller passing a byte count cannot push the scan past the
 *            end of the string
 * @return true if the first case-insensitive occurrence of s differs
 *         from s in case, false if it matches exactly or s is not found
 */
private static boolean findBadSubString(String s, String p, int len)
{
    int n = s.length();
    int i = 0;
    String ps;

    if (len > p.length())
        len = p.length();

    /* "<=" so that an occurrence ending exactly at len is also examined */
    while (n <= len)
    {
        ps = p.substring(i, i + n);
        if (wstrcasecmp(s, ps) == 0)
            return (!ps.equals(s));
        ++i;
        --len;
    }
    return false;
}
/**
 * Checks that the doctype keywords SYSTEM, PUBLIC, //DTD, //W3C and //EN
 * are spelled in upper case wherever they occur in the doctype text.
 *
 * The text is decoded from UTF-8, so its length in chars can be smaller
 * than the node's length in bytes; the char length is what is passed to
 * findBadSubString(), otherwise the scan could run past the end of the
 * decoded string.
 *
 * @param doctype doctype node whose text lies in lexbuf
 * @return true if no wrongly-cased keyword was found
 */
public boolean checkDocTypeKeyWords(Node doctype)
{
    String s = getString(this.lexbuf, doctype.start, doctype.end - doctype.start);
    int len = s.length();
    return !(
        findBadSubString("SYSTEM", s, len) ||
        findBadSubString("PUBLIC", s, len) ||
        findBadSubString("//DTD", s, len) ||
        findBadSubString("//W3C", s, len) ||
        findBadSubString("//EN", s, len)
    );
}
/*
examine the doctype text to identify the version it declares

The text is expected to start with "html " followed by "SYSTEM " or
"PUBLIC " (compared ignoring case). Returns the code of the matching
W3CVersion entry, or 0 when the doctype is not recognized. As side
effects it warns about lower case keywords, rewrites a wrongly-cased
SYSTEM/PUBLIC keyword in lexbuf, and sets badDoctype when neither
keyword follows "html ".
*/
public short findGivenVersion(Node doctype)
{
String p, s;
int i, j;
int len;
String str1;
String str2;
/* if root tag for doctype isn't html give up now */
str1 = getString(this.lexbuf, doctype.start, 5);
if (wstrcasecmp(str1, "html ") != 0)
return 0;
if (!checkDocTypeKeyWords(doctype))
Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
/* give up if all we are given is the system id for the doctype */
str1 = getString(this.lexbuf, doctype.start + 5, 7);
if (wstrcasecmp(str1, "SYSTEM ") == 0)
{
/* but at least ensure the case is correct */
if (!str1.substring(0, 6).equals("SYSTEM"))
System.arraycopy( getBytes("SYSTEM"), 0,
this.lexbuf, doctype.start + 5, 6 );
return 0; /* unrecognized */
}
if (wstrcasecmp(str1, "PUBLIC ") == 0)
{
/* fix the keyword's case in place; only the 6 letters are copied */
if (!str1.substring(0, 6).equals("PUBLIC"))
System.arraycopy( getBytes("PUBLIC "), 0,
this.lexbuf, doctype.start + 5, 6 );
}
else
this.badDoctype = true;
/* find the first double quote: the public identifier starts right after it */
for (i = doctype.start; i < doctype.end; ++i)
{
if (this.lexbuf[i] == (byte)'"')
{
/* 12 bytes for "-//W3C//DTD ", 13 bytes for "-//IETF//DTD " */
str1 = getString( this.lexbuf, i + 1, 12 );
str2 = getString( this.lexbuf, i + 1, 13 );
if (str1.equals("-//W3C//DTD "))
{
/* compute length of identifier e.g. "HTML 4.0 Transitional" */
for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
len = j - i - 13;
p = getString( this.lexbuf, i + 13, len );
/* entry 0 is skipped here; it is the one compared in the IETF branch below */
for (j = 1; j < W3CVersion.length; ++j)
{
s = W3CVersion[j].name;
if (len == s.length() && s.equals(p))
return W3CVersion[j].code;
}
/* else unrecognized version */
}
else if (str2.equals("-//IETF//DTD "))
{
/* compute length of identifier e.g. "HTML 2.0" */
for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
len = j - i - 14;
p = getString( this.lexbuf, i + 14, len );
s = W3CVersion[0].name;
if (len == s.length() && s.equals(p))
return W3CVersion[0].code;
/* else unrecognized version */
}
/* only the first quoted string is examined */
break;
}
}
return 0;
}
/**
 * Ensures the html element carries an xmlns attribute equal to the given
 * profile.
 *
 * The first html element among root's children is used. If it already
 * has an xmlns attribute with a different (or missing) value, a warning
 * is reported and the value is replaced; if it has none, a new xmlns
 * attribute is put at the front of its attribute list. Nothing happens
 * when there is no html element.
 *
 * Attributes without a name and an xmlns attribute without a value are
 * tolerated rather than causing a NullPointerException.
 *
 * @param root    root of the document tree
 * @param profile namespace URI the xmlns attribute should have
 */
public void fixHTMLNameSpace(Node root, String profile)
{
    Node node = root.content;
    while (node != null && node.tag != configuration.tt.tagHtml)
    {
        node = node.next;
    }
    if (node == null)
    {
        return;
    }

    /* constant-first equals() so a null attribute name is simply skipped */
    AttVal attr = node.attributes;
    while (attr != null && !"xmlns".equals(attr.attribute))
    {
        attr = attr.next;
    }

    if (attr != null)
    {
        if (attr.value == null || !attr.value.equals(profile))
        {
            Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
            attr.value = profile;
        }
    }
    else
    {
        attr = new AttVal( node.attributes, null, (int)'"',
                           "xmlns", profile );
        attr.dict =
            AttributeTable.getDefaultAttributeTable().findAttribute( attr );
        node.attributes = attr;
    }
}
/**
 * Gives the document an XHTML doctype and the XHTML namespace.
 *
 * With DOCTYPE_OMIT any existing doctype is discarded and nothing else
 * is done. Otherwise the public and system identifiers are chosen from
 * the doctype mode (AUTO picks strict, transitional or frameset from the
 * versions bit vector; USER takes configuration.docTypeStr as the public
 * identifier with an empty system identifier), the html element's xmlns
 * attribute is fixed, a doctype node is created at the front of root's
 * children if there was none, and the doctype text is written to the
 * lexer buffer.
 *
 * @param root root of the document tree
 * @return true when the doctype was omitted, false when it was written
 */
public boolean setXHTMLDocType(Node root)
{
    String fpi = " ";
    String sysid = "";
    String namespace = XHTML_NAMESPACE;
    Node doctype;

    doctype = root.findDocType();

    if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
    {
        if (doctype != null)
            Node.discardElement(doctype);
        return true;
    }

    if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
    {
        /* see what flavor of XHTML this document matches */
        if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
        { /* use XHTML strict */
            fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
            sysid = voyager_strict;
        }
        else if ((this.versions & Dict.VERS_LOOSE) != 0)
        {
            fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
            sysid = voyager_loose;
        }
        else if ((this.versions & Dict.VERS_FRAMES) != 0)
        { /* use XHTML frames */
            fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
            sysid = voyager_frameset;
        }
        else /* lets assume XHTML transitional */
        {
            fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
            sysid = voyager_loose;
        }
    }
    else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
    {
        fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
        sysid = voyager_strict;
    }
    else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
    {
        fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
        sysid = voyager_loose;
    }

    fixHTMLNameSpace(root, namespace);

    if (doctype == null)
    {
        doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
        doctype.next = root.content;
        doctype.parent = root;
        doctype.prev = null;
        /* keep the sibling list doubly linked, as fixXMLPI() does */
        if (root.content != null)
            root.content.prev = doctype;
        root.content = doctype;
    }

    if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
        configuration.docTypeStr != null)
    {
        fpi = configuration.docTypeStr;
        sysid = "";
    }

    this.txtstart = this.lexsize;
    this.txtend = this.lexsize;

    /* add public identifier */
    addStringLiteral("html PUBLIC ");

    /* check if the fpi is quoted or not; startsWith is safe for an empty string */
    if (fpi.startsWith("\""))
        addStringLiteral(fpi);
    else
    {
        addStringLiteral("\"");
        addStringLiteral(fpi);
        addStringLiteral("\"");
    }

    /* put the system identifier on its own line, indented only if it fits */
    if (sysid.length() + 6 >= this.configuration.wraplen)
        addStringLiteral("\n\"");
    else
        addStringLiteral("\n \"");

    /* add system identifier */
    addStringLiteral(sysid);
    addStringLiteral("\"");

    this.txtend = this.lexsize;
    doctype.start = this.txtstart;
    doctype.end = this.txtend;
    return false;
}
/**
 * Determines the version the document appears to be.
 *
 * When no doctype version was given, the result is HTMLVersion(). When
 * the given doctype version is one of the known versions and its bit is
 * still set in the versions bit vector, that version is returned.
 * Otherwise an INCONSISTENT_VERSION warning is reported and
 * HTMLVersion() is returned.
 *
 * @return the apparent version code
 */
public short apparentVersion()
{
    final int declared = this.doctype;

    if (declared == Dict.VERS_UNKNOWN)
    {
        return HTMLVersion();
    }

    final boolean known =
        declared == Dict.VERS_HTML20 ||
        declared == Dict.VERS_HTML32 ||
        declared == Dict.VERS_HTML40_STRICT ||
        declared == Dict.VERS_HTML40_LOOSE ||
        declared == Dict.VERS_FRAMES;

    /* the declared version stands only if the content seen so far allows it */
    if (known && (this.versions & declared) != 0)
    {
        return (short) declared;
    }

    Report.warning(this, null, null, Report.INCONSISTENT_VERSION);
    return this.HTMLVersion();
}
/**
 * Fixes up the doctype if it is missing or inconsistent.
 *
 * Reports MALFORMED_DOCTYPE when badDoctype is set. Nothing further is
 * done for XML output. With DOCTYPE_OMIT an existing doctype is
 * discarded. STRICT and LOOSE replace any existing doctype with HTML 4.0
 * strict/loose. AUTO keeps an existing doctype whose version is still
 * consistent with the content, and otherwise chooses a version with
 * HTMLVersion(). For XmlTags or Voyager documents the doctype is dropped
 * and the html namespace is fixed instead. In the remaining cases the
 * doctype text is written to the lexer buffer, creating the doctype node
 * at the front of root's children if needed.
 *
 * @param root root of the document tree
 * @return false when an existing doctype has an unknown version in AUTO
 *         mode or no version could be chosen; true otherwise
 */
public boolean fixDocType(Node root)
{
    Node doctype;
    int guessed = Dict.VERS_HTML40_STRICT;
    int i;

    if (this.badDoctype)
        Report.warning(this, null, null, Report.MALFORMED_DOCTYPE);

    if (configuration.XmlOut)
        return true;

    doctype = root.findDocType();

    if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
    {
        if (doctype != null)
            Node.discardElement(doctype);
        return true;
    }

    if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
    {
        /* guarded like every other discardElement() call in this class */
        if (doctype != null)
            Node.discardElement(doctype);
        doctype = null;
        guessed = Dict.VERS_HTML40_STRICT;
    }
    else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
    {
        if (doctype != null)
            Node.discardElement(doctype);
        doctype = null;
        guessed = Dict.VERS_HTML40_LOOSE;
    }
    else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
    {
        if (doctype != null)
        {
            if (this.doctype == Dict.VERS_UNKNOWN)
                return false;

            /* keep the given doctype if the content is consistent with it */
            switch (this.doctype)
            {
            case Dict.VERS_HTML20:
                if ((this.versions & Dict.VERS_HTML20) != 0)
                    return true;
                break; /* to replace old version by new */
            case Dict.VERS_HTML32:
                if ((this.versions & Dict.VERS_HTML32) != 0)
                    return true;
                break; /* to replace old version by new */
            case Dict.VERS_HTML40_STRICT:
                if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
                    return true;
                break; /* to replace old version by new */
            case Dict.VERS_HTML40_LOOSE:
                if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
                    return true;
                break; /* to replace old version by new */
            case Dict.VERS_FRAMES:
                if ((this.versions & Dict.VERS_FRAMES) != 0)
                    return true;
                break; /* to replace old version by new */
            }
            /* INCONSISTENT_VERSION warning is now issued by apparentVersion() */
        }

        /* choose new doctype */
        guessed = HTMLVersion();
    }

    if (guessed == Dict.VERS_UNKNOWN)
        return false;

    /* for XML use the Voyager system identifier */
    if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager)
    {
        if (doctype != null)
            Node.discardElement(doctype);

        for (i = 0; i < W3CVersion.length; ++i)
        {
            if (guessed == W3CVersion[i].code)
            {
                fixHTMLNameSpace(root, W3CVersion[i].profile);
                break;
            }
        }
        return true;
    }

    if (doctype == null)
    {
        doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
        doctype.next = root.content;
        doctype.parent = root;
        doctype.prev = null;
        /* keep the sibling list doubly linked, as fixXMLPI() does */
        if (root.content != null)
            root.content.prev = doctype;
        root.content = doctype;
    }

    this.txtstart = this.lexsize;
    this.txtend = this.lexsize;

    /* use the appropriate public identifier */
    addStringLiteral("html PUBLIC ");

    if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
        configuration.docTypeStr != null)
        addStringLiteral(configuration.docTypeStr);
    else if (guessed == Dict.VERS_HTML20)
        addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
    else
    {
        addStringLiteral("\"-//W3C//DTD ");
        for (i = 0; i < W3CVersion.length; ++i)
        {
            if (guessed == W3CVersion[i].code)
            {
                addStringLiteral(W3CVersion[i].name);
                break;
            }
        }
        addStringLiteral("//EN\"");
    }

    this.txtend = this.lexsize;
    doctype.start = this.txtstart;
    doctype.end = this.txtend;
    return true;
}
/**
 * Ensures the document starts with an xml processing instruction.
 *
 * If root's first child is a processing instruction whose text in the
 * lexer buffer begins with "xml", nothing is changed. Otherwise a new
 * processing instruction node is linked in front of root's children and
 * its text ("xml version="1.0"", plus an ISO-8859-1 encoding
 * pseudo-attribute when the character encoding is LATIN1) is appended to
 * the lexer buffer.
 *
 * @param root root of the document tree
 * @return true if the declaration was already present, false if it was
 *         added
 */
public boolean fixXMLPI(Node root)
{
    final Node first = root.content;

    if (first != null && first.type == Node.ProcInsTag)
    {
        final int s = first.start;
        final boolean startsWithXml =
            this.lexbuf[s] == (byte)'x' &&
            this.lexbuf[s+1] == (byte)'m' &&
            this.lexbuf[s+2] == (byte)'l';
        if (startsWithXml)
        {
            return true;
        }
    }

    /* link the new node in front of the existing children */
    final Node xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0);
    xml.next = first;
    if (first != null)
    {
        first.prev = xml;
    }
    root.content = xml;

    /* write the node's text at the end of the buffer */
    this.txtstart = this.lexsize;
    this.txtend = this.lexsize;
    addStringLiteral("xml version=\"1.0\"");
    if (this.configuration.CharEncoding == Configuration.LATIN1)
    {
        addStringLiteral(" encoding=\"ISO-8859-1\"");
    }
    this.txtend = this.lexsize;

    xml.start = this.txtstart;
    xml.end = this.txtend;
    return false;
}
/**
 * Creates a start tag node for an element that was not present in the
 * input. The node spans the current txtstart..txtend range of the lexer
 * buffer and is marked implicit.
 *
 * @param name element name
 * @return the new implicit start tag node
 */
public Node inferredTag(String name)
{
    final Node inferred =
        newNode(Node.StartTag, this.lexbuf, this.txtstart, this.txtend, name);
    inferred.implicit = true;
    return inferred;
}
/**
 * Tells whether a node is a start tag that may have content.
 *
 * @param node node to examine
 * @return false for anything but a start tag and for elements whose
 *         content model has the CM_EMPTY bit; true otherwise, including
 *         unknown elements (node.tag == null)
 */
public static boolean expectsContent(Node node)
{
    if (node.type != Node.StartTag)
    {
        return false;
    }
    /* unknown element, or a known one that is not declared empty */
    return node.tag == null || (node.tag.model & Dict.CM_EMPTY) == 0;
}
/*
create a text node for the contents of
a CDATA element like style or script
which ends with </foo> for some foo.

Reads characters into the lexer buffer until an end tag whose name
matches container.element (ignoring case) is seen, or input ends.
Any other "</...>" sequence inside the content triggers a
BAD_CDATA_CONTENT warning and, for JavaScript containers, gets a
backslash inserted before the '/'.
Returns the text node (also stored in this.token), or null when the
element had no content.
*/
public Node getCDATA(Node container)
{
int c, lastc, start, len, i;
String str;
boolean endtag = false;
this.lines = this.in.curline;
this.columns = this.in.curcol;
this.waswhite = false;
this.txtstart = this.lexsize;
this.txtend = this.lexsize;
lastc = (int)'\0';
/* start is the buffer offset of the first letter after "</", or -1 if none is pending */
start = -1;
while (true)
{
c = this.in.readChar();
if (c == StreamIn.EndOfStream) break;
/* "</" seen: remember where the candidate end tag name begins */
if (c == (int)'/' && lastc == (int)'<')
{
if (endtag)
{
this.lines = this.in.curline;
this.columns = this.in.curcol - 3;
Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
}
start = this.lexsize + 1; /* to first letter */
endtag = true;
}
else if (c == (int)'>' && start >= 0)
{
/* compare the name between "</" and ">" with the container's name */
len = this.lexsize - start;
if (len == container.element.length())
{
str = getString( this.lexbuf, start, len );
if (Lexer.wstrcasecmp(str, container.element) == 0)
{
/* text ends before the "</" of the matching end tag */
this.txtend = start - 2;
break;
}
}
this.lines = this.in.curline;
this.columns = this.in.curcol - 3;
Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
/* if javascript insert backslash before / */
if (ParserImpl.isJavaScript(container))
{
/* shift the tail of the buffer up by one byte to make room */
for (i = this.lexsize; i > start-1; --i)
this.lexbuf[i] = this.lexbuf[i-1];
this.lexbuf[start-1] = (byte)'\\';
this.lexsize++;
}
start = -1;
}
/* treat \r\n as \n and \r as \n */
else if (c == (int)'\r')
{
c = this.in.readChar();
if (c != (int)'\n')
this.in.ungetChar(c);
c = (int)'\n';
}
addCharToLexer((int)c);
this.txtend = this.lexsize;
lastc = c;
}
if (c == StreamIn.EndOfStream)
Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TextNode,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
return null;
}
/**
 * Provides one level of undo: marks the current token as pushed back so
 * that getToken() can hand it out again.
 */
public void ungetToken()
{
    pushed = true;
}
/* lexer modes passed to getToken(); parseEntity() also tests the Preformatted bit */
public static final short IgnoreWhitespace = 0;
public static final short MixedContent = 1;
public static final short Preformatted = 2;
public static final short IgnoreMarkup = 3;
/*
modes for GetToken()
IgnoreWhitespace -- leading white space of a text run is discarded;
getToken() switches to MixedContent after the first non-white char
MixedContent -- for elements which don't accept PCDATA
Preformatted -- white space preserved as is
IgnoreMarkup -- for CDATA elements such as script, style
*/
public Node getToken(short mode)
{
short map;
int c = 0;
int lastc;
int badcomment = 0;
MutableBoolean isempty = new MutableBoolean();
AttVal attributes;
if (this.pushed)
{
/* duplicate inlines in preference to pushed text nodes when appropriate */
if (this.token.type != Node.TextNode ||
(this.insert == -1 && this.inode == null))
{
this.pushed = false;
return this.token;
}
}
/* at start of block elements, unclosed inline
elements are inserted into the token stream */
if (this.insert != -1 || this.inode != null)
return insertedToken();
this.lines = this.in.curline;
this.columns = this.in.curcol;
this.waswhite = false;
this.txtstart = this.lexsize;
this.txtend = this.lexsize;
while (true)
{
c = this.in.readChar();
if (c == StreamIn.EndOfStream) break;
if (this.insertspace && mode != IgnoreWhitespace)
{
addCharToLexer(' ');
this.waswhite = true;
this.insertspace = false;
}
/* treat \r\n as \n and \r as \n */
if (c == '\r')
{
c = this.in.readChar();
if (c != '\n')
this.in.ungetChar(c);
c = '\n';
}
addCharToLexer(c);
switch (this.state)
{
case LEX_CONTENT: /* element content */
map = MAP((char)c);
/*
Discard white space if appropriate. Its cheaper
to do this here rather than in parser methods
for elements that don't have mixed content.
*/
if (((map & WHITE) != 0) && (mode == IgnoreWhitespace)
&& this.lexsize == this.txtstart + 1)
{
--this.lexsize;
this.waswhite = false;
this.lines = this.in.curline;
this.columns = this.in.curcol;
continue;
}
if (c == '<')
{
this.state = LEX_GT;
continue;
}
if ((map & WHITE) != 0)
{
/* was previous char white? */
if (this.waswhite)
{
if (mode != Preformatted && mode != IgnoreMarkup)
{
--this.lexsize;
this.lines = this.in.curline;
this.columns = this.in.curcol;
}
}
else /* prev char wasn't white */
{
this.waswhite = true;
lastc = c;
if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
changeChar((byte)' ');
}
continue;
}
else if (c == '&' && mode != IgnoreMarkup)
parseEntity(mode);
/* this is needed to avoid trimming trailing whitespace */
if (mode == IgnoreWhitespace)
mode = MixedContent;
this.waswhite = false;
continue;
case LEX_GT: /* < */
/* check for endtag */
if (c == '/')
{
c = this.in.readChar();
if (c == StreamIn.EndOfStream)
{
this.in.ungetChar(c);
continue;
}
addCharToLexer(c);
map = MAP((char)c);
if ((map & LETTER) != 0)
{
this.lexsize -= 3;
this.txtend = this.lexsize;
this.in.ungetChar(c);
this.state = LEX_ENDTAG;
this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */
this.in.curcol -= 2;
/* if some text before the return it now */
if (this.txtend > this.txtstart)
{
/* trim space char before end tag */
if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte)' ')
{
this.lexsize -= 1;
this.txtend = this.lexsize;
}
this.token = newNode(Node.TextNode,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
continue; /* no text so keep going */
}
/* otherwise treat as CDATA */
this.waswhite = false;
this.state = LEX_CONTENT;
continue;
}
if (mode == IgnoreMarkup)
{
/* otherwise treat as CDATA */
this.waswhite = false;
this.state = LEX_CONTENT;
continue;
}
/*
look out for comments, doctype or marked sections
this isn't quite right, but its getting there ...
*/
if (c == '!')
{
c = this.in.readChar();
if (c == '-')
{
c = this.in.readChar();
if (c == '-')
{
this.state = LEX_COMMENT; /* comment */
this.lexsize -= 2;
this.txtend = this.lexsize;
/* if some text before < return it now */
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TextNode,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
Report.warning(this, null, null, Report.MALFORMED_COMMENT);
}
else if (c == 'd' || c == 'D')
{
this.state = LEX_DOCTYPE; /* doctype */
this.lexsize -= 2;
this.txtend = this.lexsize;
mode = IgnoreWhitespace;
/* skip until white space or '>' */
for (;;)
{
c = this.in.readChar();
if (c == StreamIn.EndOfStream || c == '>')
{
this.in.ungetChar(c);
break;
}
map = MAP((char)c);
if ((map & WHITE) == 0)
continue;
/* and skip to end of whitespace */
for (;;)
{
c = this.in.readChar();
if (c == StreamIn.EndOfStream || c == '>')
{
this.in.ungetChar(c);
break;
}
map = MAP((char)c);
if ((map & WHITE) != 0)
continue;
this.in.ungetChar(c);
break;
}
break;
}
/* if some text before < return it now */
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TextNode,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
else if (c == '[')
{
/* Word 2000 embeds ... sequences */
this.lexsize -= 2;
this.state = LEX_SECTION;
this.txtend = this.lexsize;
/* if some text before < return it now */
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TextNode,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
/* otherwise swallow chars up to and including next '>' */
while (true)
{
c = this.in.readChar();
if (c == '>') break;
if (c == -1)
{
this.in.ungetChar(c);
break;
}
}
this.lexsize -= 2;
this.lexbuf[this.lexsize] = (byte)'\0';
this.state = LEX_CONTENT;
continue;
}
/*
processing instructions
*/
if (c == '?')
{
this.lexsize -= 2;
this.state = LEX_PROCINSTR;
this.txtend = this.lexsize;
/* if some text before < return it now */
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TextNode,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
/* Microsoft ASP's e.g. <% ... server-code ... %> */
if (c == '%')
{
this.lexsize -= 2;
this.state = LEX_ASP;
this.txtend = this.lexsize;
/* if some text before < return it now */
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TextNode,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
/* Netscapes JSTE e.g. <# ... server-code ... #> */
if (c == '#')
{
this.lexsize -= 2;
this.state = LEX_JSTE;
this.txtend = this.lexsize;
/* if some text before < return it now */
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TextNode,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
map = MAP((char)c);
/* check for start tag */
if ((map & LETTER) != 0)
{
this.in.ungetChar(c); /* push back letter */
this.lexsize -= 2; /* discard "<" + letter */
this.txtend = this.lexsize;
this.state = LEX_STARTTAG; /* ready to read tag name */
/* if some text before < return it now */
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TextNode,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
continue; /* no text so keep going */
}
/* otherwise treat as CDATA */
this.state = LEX_CONTENT;
this.waswhite = false;
continue;
case LEX_ENDTAG: /* ' */
while (c != '>')
{
c = this.in.readChar();
if (c == StreamIn.EndOfStream)
break;
}
if (c == StreamIn.EndOfStream)
{
this.in.ungetChar(c);
continue;
}
this.state = LEX_CONTENT;
this.waswhite = false;
return this.token; /* the endtag token */
case LEX_STARTTAG: /* first letter of tagname */
this.txtstart = this.lexsize - 1; /* set txtstart to first letter */
c = parseTagName();
isempty.value = false;
attributes = null;
this.token = newNode((isempty.value ? Node.StartEndTag : Node.StartTag),
this.lexbuf,
this.txtstart,
this.txtend,
getString(this.lexbuf,
this.txtstart,
this.txtend - this.txtstart));
/* parse attributes, consuming closing ">" */
if (c != '>')
{
if (c == '/')
this.in.ungetChar(c);
attributes = parseAttrs(isempty);
}
if (isempty.value)
this.token.type = Node.StartEndTag;
this.token.attributes = attributes;
this.lexsize = this.txtstart;
this.txtend = this.txtstart;
/* swallow newline following start tag */
/* special check needed for CRLF sequence */
/* this doesn't apply to empty elements */
if (expectsContent(this.token) ||
this.token.tag == configuration.tt.tagBr)
{
c = this.in.readChar();
if (c == '\r')
{
c = this.in.readChar();
if (c != '\n')
this.in.ungetChar(c);
}
else if (c != '\n' && c != '\f')
this.in.ungetChar(c);
this.waswhite = true; /* to swallow leading whitespace */
}
else
this.waswhite = false;
this.state = LEX_CONTENT;
if (this.token.tag == null)
Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
else if (!this.configuration.XmlTags)
{
this.versions &= this.token.tag.versions;
if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0)
{
if (!this.configuration.MakeClean && (this.token.tag == configuration.tt.tagNobr ||
this.token.tag == configuration.tt.tagWbr))
Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
}
if (this.token.tag.chkattrs != null)
{
this.token.checkUniqueAttributes(this);
this.token.tag.chkattrs.check(this, this.token);
}
else
this.token.checkAttributes(this);
}
return this.token; /* return start tag */
case LEX_COMMENT: /* seen */
if (c != '-')
continue;
c = this.in.readChar();
addCharToLexer(c);
if (c != '-')
continue;
end_comment: while (true) {
c = this.in.readChar();
if (c == '>')
{
if (badcomment != 0)
Report.warning(this, null, null, Report.MALFORMED_COMMENT);
this.txtend = this.lexsize - 2; // AQ 8Jul2000
this.lexbuf[this.lexsize] = (byte)'\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.CommentTag,
this.lexbuf,
this.txtstart,
this.txtend);
/* now look for a line break */
c = this.in.readChar();
if (c == '\r')
{
c = this.in.readChar();
if (c != '\n')
this.token.linebreak = true;
}
if (c == '\n')
this.token.linebreak = true;
else
this.in.ungetChar(c);
return this.token;
}
/* note position of first such error in the comment */
if (badcomment == 0)
{
this.lines = this.in.curline;
this.columns = this.in.curcol - 3;
}
badcomment++;
if (this.configuration.FixComments)
this.lexbuf[this.lexsize - 2] = (byte)'=';
addCharToLexer(c);
/* if '-' then look for '>' to end the comment */
if (c != '-')
break end_comment;
}
/* otherwise continue to look for --> */
this.lexbuf[this.lexsize - 2] = (byte)'=';
continue;
case LEX_DOCTYPE: /* seen ' munging whitespace */
map = MAP((char)c);
if ((map & WHITE) != 0)
{
if (this.waswhite)
this.lexsize -= 1;
this.waswhite = true;
}
else
this.waswhite = false;
if (c != '>')
continue;
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte)'\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.DocTypeTag,
this.lexbuf,
this.txtstart,
this.txtend);
/* make a note of the version named by the doctype */
this.doctype = findGivenVersion(this.token);
return this.token;
case LEX_PROCINSTR: /* seen so look for '>' */
/* check for PHP preprocessor instructions */
if (this.lexsize - this.txtstart == 3)
{
if ((getString(this.lexbuf, this.txtstart, 3)).equals("php"))
{
this.state = LEX_PHP;
continue;
}
}
if (this.configuration.XmlPIs) /* insist on ?> as terminator */
{
if (c != '?')
continue;
/* now look for '>' */
c = this.in.readChar();
if (c == StreamIn.EndOfStream)
{
Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
this.in.ungetChar(c);
continue;
}
addCharToLexer(c);
}
if (c != '>')
continue;
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte)'\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.ProcInsTag,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
case LEX_ASP: /* seen <% so look for "%>" */
if (c != '%')
continue;
/* now look for '>' */
c = this.in.readChar();
if (c != '>')
{
this.in.ungetChar(c);
continue;
}
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte)'\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.AspTag,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
case LEX_JSTE: /* seen <# so look for "#>" */
if (c != '#')
continue;
/* now look for '>' */
c = this.in.readChar();
if (c != '>')
{
this.in.ungetChar(c);
continue;
}
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte)'\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.JsteTag,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
case LEX_PHP: /* seen "" */
if (c != '?')
continue;
/* now look for '>' */
c = this.in.readChar();
if (c != '>')
{
this.in.ungetChar(c);
continue;
}
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte)'\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.PhpTag,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
case LEX_SECTION: /* seen "" */
if (c == '[')
{
if (this.lexsize == (this.txtstart + 6) &&
(getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
{
this.state = LEX_CDATA;
this.lexsize -= 6;
continue;
}
}
if (c != ']')
continue;
/* now look for '>' */
c = this.in.readChar();
if (c != '>')
{
this.in.ungetChar(c);
continue;
}
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte)'\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.SectionTag,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
case LEX_CDATA: /* seen "" */
if (c != ']')
continue;
/* now look for ']' */
c = this.in.readChar();
if (c != ']')
{
this.in.ungetChar(c);
continue;
}
/* now look for '>' */
c = this.in.readChar();
if (c != '>')
{
this.in.ungetChar(c);
continue;
}
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte)'\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.CDATATag,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
}
if (this.state == LEX_CONTENT) /* text string */
{
this.txtend = this.lexsize;
if (this.txtend > this.txtstart)
{
this.in.ungetChar(c);
if (this.lexbuf[this.lexsize - 1] == (byte)' ')
{
this.lexsize -= 1;
this.txtend = this.lexsize;
}
this.token = newNode(Node.TextNode,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
}
else if (this.state == LEX_COMMENT) /* comment */
{
if (c == StreamIn.EndOfStream)
Report.warning(this, null, null, Report.MALFORMED_COMMENT);
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte)'\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.CommentTag,
this.lexbuf,
this.txtstart,
this.txtend);
return this.token;
}
return null;
}
/*
parser for ASP within start tags
Some people use ASP to customize attributes.
Tidy isn't really well suited to dealing with ASP.
This is a workaround for attributes, but it won't
deal with the case where the ASP is used to tailor
the attribute value. Here is an example of a
workaround for using ASP in attribute values:
href="<%=rsSchool.Fields("ID").Value%>"
where the ASP that generates the attribute value
is masked from Tidy by the quotemarks.
*/
/**
 * Collects the body of an ASP block found inside a start tag. The
 * caller has already consumed the opening "<%"; characters are copied
 * into the lexer buffer until the closing "%>" is seen.
 *
 * If the input ends before "%>" is found, scanning stops and whatever
 * was collected so far is returned. Previously the loop kept appending
 * the end-of-stream marker to the buffer and never terminated.
 *
 * @return an AspTag node covering the collected text, or null when the
 *         block is empty
 */
public Node parseAsp()
{
    int c;
    Node asp = null;

    this.txtstart = this.lexsize;

    for (;;)
    {
        c = this.in.readChar();
        if (c == StreamIn.EndOfStream)
            break; /* unterminated block: keep what we have */
        addCharToLexer(c);

        if (c != '%')
            continue;

        c = this.in.readChar();
        if (c == StreamIn.EndOfStream)
            break;
        addCharToLexer(c);

        if (c == '>')
        {
            /* drop the "%>" terminator from the collected text */
            this.lexsize -= 2;
            break;
        }
    }

    this.txtend = this.lexsize;

    if (this.txtend > this.txtstart)
        asp = newNode(Node.AspTag,
                      this.lexbuf,
                      this.txtstart,
                      this.txtend);

    this.txtstart = this.txtend;
    return asp;
}
/*
PHP is like ASP but is based upon XML
processing instructions, e.g. <?php ... ?>
*/
/**
 * Collects the body of a PHP block found inside a start tag. The
 * caller has already consumed the opening "<?"; characters are copied
 * into the lexer buffer until the closing "?>" is seen.
 *
 * If the input ends before "?>" is found, scanning stops and whatever
 * was collected so far is returned. Previously the loop kept appending
 * the end-of-stream marker to the buffer and never terminated.
 *
 * @return a PhpTag node covering the collected text, or null when the
 *         block is empty
 */
public Node parsePhp()
{
    int c;
    Node php = null;

    this.txtstart = this.lexsize;

    for (;;)
    {
        c = this.in.readChar();
        if (c == StreamIn.EndOfStream)
            break; /* unterminated block: keep what we have */
        addCharToLexer(c);

        if (c != '?')
            continue;

        c = this.in.readChar();
        if (c == StreamIn.EndOfStream)
            break;
        addCharToLexer(c);

        if (c == '>')
        {
            /* drop the "?>" terminator from the collected text */
            this.lexsize -= 2;
            break;
        }
    }

    this.txtend = this.lexsize;

    if (this.txtend > this.txtstart)
        php = newNode(Node.PhpTag,
                      this.lexbuf,
                      this.txtstart,
                      this.txtend);

    this.txtstart = this.txtend;
    return php;
}
/* consumes the '>' terminating start tags */
/*
Reads the next attribute name of the current start tag.
isempty - set to true when the tag is closed by "/>"
asp - receives the node from parseAsp() when "<%" is met, else null
php - receives the node from parsePhp() when "<?" is met, else null
Returns the attribute name, or null when no name was read: the tag
ended, an ASP/PHP block was parsed instead, a stray '<' was reported,
or the end of the stream was reached.
*/
public String parseAttribute(MutableBoolean isempty, MutableObject asp,
MutableObject php)
{
int start = 0;
// int len = 0; Removed by BUGFIX for 126265
short map;
String attr;
int c = 0;
asp.setObject(null); /* clear asp pointer */
php.setObject(null); /* clear php pointer */
/* skip white space before the attribute */
for (;;)
{
c = this.in.readChar();
if (c == '/')
{
/* "/>" closes an empty element; a lone '/' starts the name */
c = this.in.readChar();
if (c == '>')
{
isempty.value = true;
return null;
}
this.in.ungetChar(c);
c = '/';
break;
}
/* '>' ends the start tag and is consumed here */
if (c == '>')
return null;
if (c =='<')
{
/* "<%" and "<?" introduce server-side code inside the tag */
c = this.in.readChar();
if (c == '%')
{
asp.setObject(parseAsp());
return null;
}
else if (c == '?')
{
php.setObject(parsePhp());
return null;
}
/* any other '<' is reported; the char after it is pushed back */
this.in.ungetChar(c);
Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
return null;
}
/* a stray quote mark is reported and then skipped */
if (c == '"' || c == '\'')
{
Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
continue;
}
if (c == StreamIn.EndOfStream)
{
Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
this.in.ungetChar(c);
return null;
}
map = MAP((char)c);
if ((map & WHITE) == 0)
break;
}
/* the name is accumulated in lexbuf starting at this offset */
start = this.lexsize;
for (;;)
{
/* but push back '=' for parseValue() */
if (c == '=' || c == '>')
{
this.in.ungetChar(c);
break;
}
if (c == '<' || c == StreamIn.EndOfStream)
{
this.in.ungetChar(c);
break;
}
/* white space ends the name and is not pushed back */
map = MAP((char)c);
if ((map & WHITE) != 0)
break;
/* what should be done about non-namechar characters? */
/* currently these are incorporated into the attr name */
/* fold upper case letters to lower case unless XmlTags is set */
if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
c += (int)('a' - 'A');
// ++len; Removed by BUGFIX for 126265
addCharToLexer(c);
c = this.in.readChar();
}
// Following line added by GLP to fix BUG 126265. This is a temporary comment
// and should be removed when Tidy is fixed.
// The length is measured from the buffer rather than counted per character.
int len = this.lexsize - start;
attr = (len > 0 ? getString(this.lexbuf, start, len) : null);
/* release the buffer space that was used for the name */
this.lexsize = start;
return attr;
}
/*
invoked when < is seen in place of attribute value
but terminates on whitespace if not ASP, PHP or Tango
this routine recognizes ' and " quoted strings
*/
/**
 * Copies a server-side instruction that appears in place of an
 * attribute value into the lexer buffer. The caller has already
 * appended the leading '<'.
 *
 * When the character after '<' is '%', '?' or '@' (ASP, PHP or Tango)
 * the text runs up to and including the closing '>'. Otherwise it
 * also ends at white space, and a terminating '>' is pushed back.
 * Quoted strings inside the instruction are copied verbatim.
 *
 * If the input ends inside a quoted string, scanning stops there.
 * Previously the quote loops kept appending the end-of-stream marker
 * to the buffer and never terminated.
 *
 * @return the quote character to delimit the value with: '\'' when the
 *         text contains a double-quoted string, otherwise '"'
 */
public int parseServerInstruction()
{
    int c, map, delim = '"';
    boolean isrule = false;

    c = this.in.readChar();
    addCharToLexer(c);

    /* check for ASP, PHP or Tango */
    if (c == '%' || c == '?' || c == '@')
        isrule = true;

    for (;;)
    {
        c = this.in.readChar();

        if (c == StreamIn.EndOfStream)
            break;

        if (c == '>')
        {
            if (isrule)
                addCharToLexer(c);
            else
                this.in.ungetChar(c);
            break;
        }

        /* if not recognized as ASP, PHP or Tango */
        /* then also finish value on whitespace */
        if (!isrule)
        {
            map = MAP((char)c);
            if ((map & WHITE) != 0)
                break;
        }

        addCharToLexer(c);

        if (c == '"')
        {
            /* the text contains '"', so the value needs "'" around it */
            delim = '\'';
            do
            {
                c = this.in.readChar();
                if (c == StreamIn.EndOfStream)
                    return delim; /* unterminated string: stop here */
                addCharToLexer(c);
            }
            while (c != '"');
            continue;
        }

        if (c == '\'')
        {
            do
            {
                c = this.in.readChar();
                if (c == StreamIn.EndOfStream)
                    return delim; /* unterminated string: stop here */
                addCharToLexer(c);
            }
            while (c != '\'');
        }
    }

    return delim;
}
/* values start with "=" or " = " etc. */
/* doesn't consume the ">" at end of start tag */
/*
Reads the value that follows an attribute name.
name - the attribute name; consulted for the URL and script checks
foldCase - when true, upper case letters in the value are folded
to lower case
isempty - set to true when an unquoted value is ended by "/>"
pdelim - receives the quote character for the value; '"' unless
the value was quoted with "'" or parseServerInstruction() chose one
Returns the value text, or null when no '=' follows the name or an
unquoted value turns out to be empty.
*/
public String parseValue(String name, boolean foldCase,
MutableBoolean isempty, MutableInteger pdelim)
{
int len = 0;
int start;
short map;
boolean seen_gt = false;
boolean munge = true;
int c = 0;
int lastc, delim, quotewarning;
String value;
delim = 0;
pdelim.value = (int)'"';
/*
Henry Zrepa reports that some folk are using the
embed element with script attributes where newlines
are significant and must be preserved
*/
if (configuration.LiteralAttribs)
munge = false;
/* skip white space before the '=' */
for (;;)
{
c = this.in.readChar();
if (c == StreamIn.EndOfStream)
{
this.in.ungetChar(c);
break;
}
map = MAP((char)c);
if ((map & WHITE) == 0)
break;
}
/*
c should be '=' if there is a value
other legal possibilities are white
space, '/' and '>'
*/
if (c != '=')
{
this.in.ungetChar(c);
return null;
}
/* skip white space after '=' */
for (;;)
{
c = this.in.readChar();
if (c == StreamIn.EndOfStream)
{
this.in.ungetChar(c);
break;
}
map = MAP((char)c);
if ((map & WHITE) == 0)
break;
}
/* check for quote marks */
if (c == '"' || c == '\'')
delim = c;
else if (c == '<')
{
/* server-side code in place of a value: copy it and return it as is */
start = this.lexsize;
addCharToLexer(c);
pdelim.value = parseServerInstruction();
len = this.lexsize - start;
this.lexsize = start;
return (len > 0 ? getString(this.lexbuf, start, len) : null);
}
else
this.in.ungetChar(c);
/*
and read the value string
check for quote mark if needed
*/
quotewarning = 0;
start = this.lexsize;
c = '\0';
for (;;)
{
lastc = c; /* track last character */
c = this.in.readChar();
if (c == StreamIn.EndOfStream)
{
Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
this.in.ungetChar(c);
break;
}
/* unquoted value: ends at '>', a quote mark, "/>" or white space */
if (delim == (char)0)
{
if (c == '>')
{
this.in.ungetChar(c);
break;
}
if (c == '"' || c == '\'')
{
Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
break;
}
/* a '<' is reported but kept as part of the value */
if (c == '<')
{
/* this.in.ungetChar(c); */
Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
/* break; */
}
/*
For cases like <br clear=all/> need to avoid treating /> as
part of the attribute value, however care is needed to avoid
so treating <a href=http://www.acme.com/> in this way, which
would map the <a> tag to <a href="http://www.acme.com"/>
*/
if (c == '/')
{
/* peek ahead in case of /> */
c = this.in.readChar();
if (c == '>' &&
!AttributeTable.getDefaultAttributeTable().isUrl(name))
{
isempty.value = true;
this.in.ungetChar(c);
break;
}
/* unget peeked char */
this.in.ungetChar(c);
c = '/';
}
}
else /* delim is '\'' or '"' */
{
if (c == delim)
break;
/* treat CRLF, CR and LF as single line break */
if (c == '\r')
{
c = this.in.readChar();
if (c != '\n')
this.in.ungetChar(c);
c = '\n';
}
/* count characters that suggest the closing quote is missing */
if (c == '\n' || c == '<' || c == '>')
++quotewarning;
if (c == '>')
seen_gt = true;
}
/* keep the '&' and let parseEntity() handle the entity that follows */
if (c == '&')
{
addCharToLexer(c);
parseEntity((short)0);
continue;
}
/*
kludge for JavaScript attribute values
with line continuations in string literals
*/
if (c == '\\')
{
/* a backslash followed by a newline is dropped, leaving the newline */
c = this.in.readChar();
if (c != '\n')
{
this.in.ungetChar(c);
c = '\\';
}
}
map = MAP((char)c);
if ((map & WHITE) != 0)
{
/* white space ends an unquoted value */
if (delim == (char)0)
break;
/* in a quoted value, collapse runs of white space to one space */
if (munge)
{
c = ' ';
if (lastc == ' ')
continue;
}
}
else if (foldCase && (map & UPPERCASE) != 0)
c += (int)('a' - 'A');
addCharToLexer(c);
}
if (quotewarning > 10 && seen_gt && munge)
{
/*
there is almost certainly a missing trailing quote mark
as we have seen too many newlines, < or > characters.
an exception is made for Javascript attributes and the
javascript URL scheme which may legitimately include < and >
*/
if (!AttributeTable.getDefaultAttributeTable().isScript(name) &&
!(AttributeTable.getDefaultAttributeTable().isUrl(name) &&
(getString(this.lexbuf, start, 11)).equals("javascript:")))
Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
}
/* take the value out of lexbuf and release the space it used */
len = this.lexsize - start;
this.lexsize = start;
/* a quoted empty value yields "", an unquoted empty value yields null */
if (len > 0 || delim != 0)
value = getString(this.lexbuf, start, len);
else
value = null;
/* note delimiter if given */
if (delim != 0)
pdelim.value = delim;
else
pdelim.value = (int)'"';
return value;
}
/* attr must be non-null */
/**
 * Tests whether a string is a well-formed attribute name: a letter
 * followed by zero or more name characters, as classified by the
 * lexer character map.
 *
 * A null or empty name is reported as invalid. Previously an empty
 * string raised StringIndexOutOfBoundsException on the first
 * character lookup.
 *
 * @param attr the candidate attribute name
 * @return true if the name is valid, false otherwise
 */
public static boolean isValidAttrName(String attr)
{
    /* with no first character there is no leading letter */
    if (attr == null || attr.length() == 0)
        return false;

    /* first character should be a letter */
    if ((MAP(attr.charAt(0)) & LETTER) == 0)
        return false;

    /* remaining characters should be namechars */
    for (int i = 1; i < attr.length(); i++)
    {
        if ((MAP(attr.charAt(i)) & NAMECHAR) == 0)
            return false;
    }

    return true;
}
/* swallows closing '>' */
/**
 * Reads the attributes of the current start tag until the tag ends or
 * the input is exhausted, and returns them as a linked list. Each new
 * entry is linked in front of the previous ones, so the list head is
 * the attribute read last.
 *
 * ASP and PHP blocks found between attributes become list entries of
 * their own. Attributes whose name fails isValidAttrName() are
 * reported and left out of the list.
 *
 * @param isempty set to true by the attribute parsers when the tag is
 *                closed with "/>"
 * @return the head of the attribute list, or null if there are none
 */
public AttVal parseAttrs(MutableBoolean isempty)
{
    MutableInteger quote = new MutableInteger();
    MutableObject aspHolder = new MutableObject();
    MutableObject phpHolder = new MutableObject();
    AttVal head = null;

    while (!endOfInput())
    {
        String attrName = parseAttribute(isempty, aspHolder, phpHolder);

        if (attrName != null)
        {
            String attrValue = parseValue(attrName, false, isempty, quote);

            if (isValidAttrName(attrName))
            {
                AttVal entry = new AttVal( head, null, null, null,
                                           quote.value, attrName, attrValue );
                entry.dict =
                    AttributeTable.getDefaultAttributeTable().findAttribute(entry);
                head = entry;
            }
            else
            {
                /* a detached entry is still built, as before, then dropped */
                AttVal rejected = new AttVal( null, null, null, null,
                                              0, attrName, attrValue );
                Report.attrError(this, this.token, attrValue, Report.BAD_ATTRIBUTE_VALUE);
            }
            continue;
        }

        /* no name: check if attributes are created by ASP markup */
        if (aspHolder.getObject() != null)
        {
            head = new AttVal(head, null, (Node)aspHolder.getObject(), null,
                              '\0', null, null );
            continue;
        }

        /* ... or by PHP markup */
        if (phpHolder.getObject() != null)
        {
            head = new AttVal(head, null, null, (Node)phpHolder.getObject(),
                              '\0', null, null );
            continue;
        }

        /* otherwise the tag is finished */
        break;
    }

    return head;
}
/*
push a copy of an inline node onto stack
but don't push if implicit or OBJECT or APPLET
(implicit tags are ones generated from the istack)
One issue arises with pushing inlines when
the tag is already pushed. For instance:
<p><em>text
<p><em>more text
Shouldn't be mapped to
<p><em>text</em></p>
<p><em><em>more text</em></em>
*/
/**
 * Records an inline element on the inline stack. Nothing is pushed
 * for implicit nodes, nodes without a tag, tags that are not inline,
 * or tags with the object content model. Apart from FONT, a tag that
 * is already on the stack is not pushed a second time.
 *
 * @param node the start tag node to record
 */
public void pushInline( Node node )
{
    if (node.implicit || node.tag == null)
        return;

    boolean plainInline = (node.tag.model & Dict.CM_INLINE) != 0
                          && (node.tag.model & Dict.CM_OBJECT) == 0;
    if (!plainInline)
        return;

    boolean isFont = (node.tag == configuration.tt.tagFont);
    if (!isFont && isPushed(node))
        return;

    /* the stack entry gets its own copy of the attribute list */
    IStack entry = new IStack();
    entry.tag = node.tag;
    entry.element = node.element;
    if (node.attributes != null)
        entry.attributes = cloneAttributes(node.attributes);

    this.istack.push( entry );
}
/* pop inline stack */
/**
 * Pops the inline stack. With a null node the top entry is removed
 * unconditionally. With a node, nothing happens unless its tag is a
 * plain inline element. For an A element, entries are popped until an
 * A entry has been removed or the stack is empty.
 *
 * After popping, the insert position is reset to -1 if it no longer
 * refers to an entry on the stack.
 *
 * @param node the end tag being processed, or null
 */
public void popInline( Node node )
{
    if (node != null)
    {
        if (node.tag == null
            || (node.tag.model & Dict.CM_INLINE) == 0
            || (node.tag.model & Dict.CM_OBJECT) != 0)
        {
            return;
        }

        if (node.tag == configuration.tt.tagA)
        {
            /* unwind everything up to and including the nearest A */
            boolean foundAnchor = false;
            while (!foundAnchor && this.istack.size() > 0)
            {
                IStack top = (IStack)this.istack.pop();
                foundAnchor = (top.tag == configuration.tt.tagA);
            }

            if (this.insert >= this.istack.size())
                this.insert = -1;
            return;
        }
    }

    if (this.istack.size() > 0)
    {
        IStack discarded = (IStack)this.istack.pop();
        if (this.insert >= this.istack.size())
            this.insert = -1;
    }
}
/**
 * Tells whether the inline stack already holds an entry for the tag
 * of the given node.
 *
 * @param node the node whose tag is looked up
 * @return true if an entry with the same tag is on the stack
 */
public boolean isPushed( Node node )
{
    /* scan from the top of the stack downwards */
    int pos = this.istack.size();
    while (--pos >= 0)
    {
        IStack entry = (IStack)this.istack.elementAt(pos);
        if (entry.tag == node.tag)
            return true;
    }
    return false;
}
/*
This has the effect of inserting "missing" inline
elements around the contents of blocklevel elements
such as P, TD, TH, DIV, PRE etc. This procedure is
called at the start of ParseBlock when the inline
stack is not empty, as will be the case in:
<i><h1>italic heading</h1></i>
which is then treated as equivalent to
<h1><i>italic heading</i></h1>
This is implemented by setting the lexer into a mode
where it gets tokens from the inline stack rather than
from the input stream.
*/
/**
 * Arms the lexer to replay the inline stack entries above the current
 * stack base. When there is at least one such entry, the insert
 * position is set to the stack base and the given node is remembered
 * in inode.
 *
 * @param node the node to remember while the entries are replayed
 * @return the number of stack entries above the stack base
 */
public int inlineDup( Node node )
{
    int pending = this.istack.size() - this.istackbase;
    if (pending <= 0)
        return pending;

    this.insert = this.istackbase;
    this.inode = node;
    return pending;
}
/**
 * Supplies the next token while the lexer is replaying the inline
 * stack. While an insert position is set, an implicit start tag is
 * built from the stack entry at that position and the position moves
 * on. Once the position is -1, the remembered node (inode) is handed
 * back and cleared.
 *
 * @return an implicit start tag copied from the inline stack, or the
 *         remembered node when there is no insert position
 */
public Node insertedToken()
{
    /* no insert position: hand back the remembered node */
    if (this.insert == -1)
    {
        Node deferred = this.inode;
        this.inode = null;
        return deferred;
    }

    /* with no remembered node, stamp the token with the current input position */
    if (this.inode == null)
    {
        this.lines = this.in.curline;
        this.columns = this.in.curcol;
    }

    // GLP: Bugfix 126261. Remove when this change
    // is fixed in istack.c in the original Tidy
    Node dup = newNode(Node.StartTag,
                       this.lexbuf,
                       this.txtstart,
                       this.txtend);
    dup.implicit = true;

    IStack entry = (IStack)this.istack.elementAt( this.insert );
    dup.element = entry.element;
    dup.tag = entry.tag;
    if (entry.attributes != null)
        dup.attributes = cloneAttributes(entry.attributes);

    /* move to the next stack entry, or stop after the last one */
    int next = this.insert + 1;
    this.insert = (next < this.istack.size()) ? next : -1;

    return dup;
}
/* AQ: Try this for speed optimization */
/**
 * Case-insensitive equality test with a strcmp-style result. Only
 * equality is reported; the result carries no ordering.
 *
 * @param s1 first string, must not be null
 * @param s2 second string; null compares as different
 * @return 0 if the strings are equal ignoring case, otherwise 1
 */
public static int wstrcasecmp(String s1, String s2)
{
    if (s1.equalsIgnoreCase(s2))
        return 0;
    return 1;
}
/**
 * Case-insensitive lexicographic comparison; case folding uses
 * toLower().
 *
 * The order of the first differing pair is now decided on the folded
 * characters. Previously the scan folded case but the final comparison
 * used the raw characters, so "a" vs "B" gave 1 while "A" vs "b" gave
 * -1.
 *
 * @param s1 first string, must not be null
 * @param s2 second string, must not be null
 * @return 0 if the strings are equal ignoring case, -1 if s1 sorts
 *         before s2, 1 if it sorts after; a proper prefix sorts first
 */
public static int wstrcaselexcmp(String s1, String s2)
{
    int len1 = s1.length();
    int len2 = s2.length();

    for (int i = 0; i < len1 && i < len2; i++)
    {
        char c1 = toLower( s1.charAt(i) );
        char c2 = toLower( s2.charAt(i) );
        if ( c1 != c2 )
            return ( c1 > c2 ? 1 : -1 );
    }

    /* one string is a prefix of the other: the shorter sorts first */
    if ( len1 == len2 )
        return 0;
    return ( len1 < len2 ? -1 : 1 );
}
/**
 * Case-insensitive substring test.
 *
 * Each candidate position is now compared over a window as long as
 * s2. Previously s2 was compared with the whole remaining suffix of
 * s1, so a match was only found when s1 ended with s2.
 *
 * @param s1 the string to search in, must not be null
 * @param s2 the string to look for, must not be null
 * @return true if s2 occurs anywhere in s1, ignoring case
 */
public static boolean wsubstr(String s1, String s2)
{
    int len1 = s1.length();
    int len2 = s2.length();

    for (int i = 0; i <= len1 - len2; ++i)
    {
        if (s1.regionMatches(true, i, s2, 0, len2))
            return true;
    }

    return false;
}
/**
 * Decides whether a node may be pruned. Text nodes always qualify.
 * Any other node is kept when one of the following holds:
 * it has content; it is an A element with attributes; it is a P
 * element and DropEmptyParas is off; it has no tag; its tag has the
 * row content model; it is an APPLET or OBJECT element; it carries an
 * "id" or "name" attribute.
 *
 * @param element the node to examine
 * @return true if the node may be pruned
 */
public boolean canPrune(Node element)
{
    if (element.type == Node.TextNode)
        return true;

    /* the tests run in the original order; the null check guards the model test */
    boolean mustKeep =
        element.content != null
        || (element.tag == configuration.tt.tagA && element.attributes != null)
        || (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas)
        || element.tag == null
        || (element.tag.model & Dict.CM_ROW) != 0
        || element.tag == configuration.tt.tagApplet
        || element.tag == configuration.tt.tagObject
        || (element.attributes != null
            && (element.getAttrByName("id") != null
                || element.getAttrByName("name") != null));

    return !mustKeep;
}
/* duplicate name attribute as an id */
/**
 * Reconciles the "name" and "id" attributes of a node. When both are
 * present and their values differ, an ID_NAME_MISMATCH attribute
 * error is reported. When only "name" is present and XmlOut is set,
 * an "id" attribute with the same value is added.
 *
 * The comparison is null-safe. An attribute written without a value
 * (for example {@code <a name=x id>}) has a null value, which
 * previously raised NullPointerException here.
 *
 * @param node the element whose attributes are checked
 */
public void fixId(Node node)
{
    AttVal name = node.getAttrByName("name");
    AttVal id = node.getAttrByName("id");

    if (name == null)
        return;

    if (id != null)
    {
        /* two missing values count as equal; one missing value is a mismatch */
        boolean differs = (id.value == null)
                          ? (name.value != null)
                          : !id.value.equals(name.value);
        if (differs)
            Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH);
    }
    else if (this.configuration.XmlOut)
        node.addAttribute("id", name.value);
}
/*
defer duplicates when entering a table or other
element where the inlines shouldn't be duplicated
*/
/**
 * Cancels any pending replay of the inline stack: forgets the
 * remembered node and clears the insert position.
 */
public void deferDup()
{
    this.inode = null;
    this.insert = -1;
}
/* Private methods and fields */
/* lexer char types */
/* Bit flags stored in lexmap; one character may carry several of them. */
private static final short DIGIT = 1; /* 0-9 */
private static final short LETTER = 2; /* a-z and A-Z */
private static final short NAMECHAR = 4; /* letters, digits and "-.:_" */
private static final short WHITE = 8; /* space, tab, CR, LF, FF */
private static final short NEWLINE = 16; /* CR, LF, FF */
private static final short LOWERCASE = 32; /* a-z */
private static final short UPPERCASE = 64; /* A-Z */
/* lexer GetToken states */
private static final short LEX_CONTENT = 0; /* reading ordinary text content */
private static final short LEX_GT = 1; /* NOTE(review): no use visible in the reviewed section */
private static final short LEX_ENDTAG = 2; /* skipping to the '>' of an end tag */
private static final short LEX_STARTTAG = 3; /* about to read a start tag name */
private static final short LEX_COMMENT = 4; /* inside a comment, looking for "-->" */
private static final short LEX_DOCTYPE = 5; /* inside a doctype, looking for '>' */
private static final short LEX_PROCINSTR = 6; /* inside a processing instruction */
private static final short LEX_ENDCOMMENT = 7; /* NOTE(review): no use visible in the reviewed section */
private static final short LEX_CDATA = 8; /* inside a CDATA section, looking for "]]>" */
private static final short LEX_SECTION = 9; /* inside a marked section, looking for "]>" */
private static final short LEX_ASP = 10; /* seen <% so looking for "%>" */
private static final short LEX_JSTE = 11; /* seen <# so looking for "#>" */
private static final short LEX_PHP = 12; /* inside a PHP instruction, looking for "?>" */
/* used to classify chars for lexical purposes */
/* one entry of char-type flags per ASCII code (0-127); filled by the static initializer, read through MAP() */
private static short[] lexmap = new short[128];
/**
 * ORs the given flags into the lexmap entry of every character in
 * str. All characters must have codes below 128.
 *
 * @param str  the characters to classify
 * @param code the char-type flags to add
 */
private static void mapStr(String str, short code)
{
    int pos = 0;
    while ( pos < str.length() ) {
        int slot = (int)str.charAt(pos);
        lexmap[slot] |= code;
        pos++;
    }
}
static {
mapStr("\r\n\f", (short)(NEWLINE|WHITE));
mapStr(" \t", WHITE);
mapStr("-.:_", NAMECHAR);
mapStr("0123456789", (short)(DIGIT|NAMECHAR));
mapStr("abcdefghijklmnopqrstuvwxyz", (short)(LOWERCASE|LETTER|NAMECHAR));
mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short)(UPPERCASE|LETTER|NAMECHAR));
}
/**
 * Looks up the char-type flags of a character.
 *
 * @param c the character to classify
 * @return the lexmap flags for c, or 0 for codes of 128 and above
 */
private static short MAP( char c )
{
    if ((int)c >= 128)
        return 0;
    return lexmap[(int)c];
}
/** Returns true if c carries the WHITE flag in the lexer character map. */
private static boolean isWhite(char c)
{
    return (MAP(c) & WHITE) != 0;
}
/** Returns true if c carries the DIGIT flag in the lexer character map. */
private static boolean isDigit(char c)
{
    return (MAP(c) & DIGIT) != 0;
}
/** Returns true if c carries the LETTER flag in the lexer character map. */
private static boolean isLetter(char c)
{
    return (MAP(c) & LETTER) != 0;
}
/**
 * Folds a character to lower case. Only characters flagged UPPERCASE
 * in the lexer character map are changed.
 */
private static char toLower(char c)
{
    boolean isUpper = (MAP(c) & UPPERCASE) != 0;
    if (!isUpper)
        return c;
    return (char)( (int)c + ((int)'a' - (int)'A') );
}
/**
 * Folds a character to upper case. Only characters flagged LOWERCASE
 * in the lexer character map are changed.
 */
private static char toUpper(char c)
{
    boolean isLower = (MAP(c) & LOWERCASE) != 0;
    if (!isLower)
        return c;
    return (char)( (int)c - ((int)'a' - (int)'A') );
}
/**
 * Folds the case of a character unless XML tag handling is on.
 *
 * @param c       the character to fold
 * @param tocaps  true to fold to upper case, false to fold to lower
 * @param xmlTags when true the character is returned unchanged
 * @return the folded character; characters not flagged as letters of
 *         the opposite case in the lexer character map are unchanged
 */
public static char foldCase(char c, boolean tocaps, boolean xmlTags)
{
    if (xmlTags)
        return c;

    short flags = MAP(c);

    if (tocaps)
    {
        if ((flags & LOWERCASE) != 0)
            return (char)( (int)c + (int)'A' - (int)'a' );
        return c;
    }

    /* force to lower case */
    if ((flags & UPPERCASE) != 0)
        return (char)( (int)c + (int)'a' - (int)'A' );
    return c;
}
/* One row of the W3CVersion table: an HTML version and its XHTML counterpart. */
private static class W3CVersionInfo
{
/* HTML version name, e.g. "HTML 4.01 Transitional" */
String name;
/* matching XHTML version name, e.g. "XHTML 1.0 Transitional" */
String voyagerName;
/* URI of the matching XHTML DTD */
String profile;
/* Dict.VERS_* code for this version */
short code;
/* stores the four values as given */
public W3CVersionInfo( String name,
String voyagerName,
String profile,
short code )
{
this.name = name;
this.voyagerName = voyagerName;
this.profile = profile;
this.code = code;
}
}
/* the 3 URIs for the XHTML 1.0 DTDs */
/* transitional, strict and frameset DTD URIs, in that order */
private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
/* XHTML namespace URI */
private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
/*
Table of known W3C HTML versions. Each row gives the HTML version
name, the XHTML name it maps to, the DTD URI of that XHTML version
and the Dict.VERS_* code.
*/
private static Lexer.W3CVersionInfo[] W3CVersion =
{
/* HTML 4.01 family */
new W3CVersionInfo("HTML 4.01",
"XHTML 1.0 Strict",
voyager_strict,
Dict.VERS_HTML40_STRICT),
new W3CVersionInfo("HTML 4.01 Transitional",
"XHTML 1.0 Transitional",
voyager_loose,
Dict.VERS_HTML40_LOOSE),
new W3CVersionInfo("HTML 4.01 Frameset",
"XHTML 1.0 Frameset",
voyager_frameset,
Dict.VERS_FRAMES),
/* HTML 4.0 family, same mapping as 4.01 */
new W3CVersionInfo("HTML 4.0",
"XHTML 1.0 Strict",
voyager_strict,
Dict.VERS_HTML40_STRICT),
new W3CVersionInfo("HTML 4.0 Transitional",
"XHTML 1.0 Transitional",
voyager_loose,
Dict.VERS_HTML40_LOOSE),
new W3CVersionInfo("HTML 4.0 Frameset",
"XHTML 1.0 Frameset",
voyager_frameset,
Dict.VERS_FRAMES),
/* older versions */
new W3CVersionInfo("HTML 3.2",
"XHTML 1.0 Transitional",
voyager_loose,
Dict.VERS_HTML32),
new W3CVersionInfo("HTML 2.0",
"XHTML 1.0 Strict",
voyager_strict,
Dict.VERS_HTML20)
};
}