2 * @(#)Tidy.java 1.11 2000/08/16
7 HTML parser and pretty printer
9 Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
10 Institute of Technology, Institut National de Recherche en
11 Informatique et en Automatique, Keio University). All Rights
14 Contributing Author(s):
16 Dave Raggett <dsr@w3.org>
17 Andy Quick <ac.quick@sympatico.ca> (translation to Java)
19 The contributing author(s) would like to thank all those who
20 helped with testing, bug fixes, and patience. This wouldn't
21 have been possible without all of you.
25 This software and documentation is provided "as is," and
26 the copyright holders and contributing author(s) make no
27 representations or warranties, express or implied, including
28 but not limited to, warranties of merchantability or fitness
29 for any particular purpose or that the use of the software or
30 documentation will not infringe any third party patents,
31 copyrights, trademarks or other rights.
33 The copyright holders and contributing author(s) will not be
34 liable for any direct, indirect, special or consequential damages
35 arising out of any use of the software or documentation, even if
36 advised of the possibility of such damage.
38 Permission is hereby granted to use, copy, modify, and distribute
39 this source code, or portions hereof, documentation and executables,
40 for any purpose, without fee, subject to the following restrictions:
42 1. The origin of this source code must not be misrepresented.
43 2. Altered versions must be plainly marked as such and must
44 not be misrepresented as being the original source.
45 3. This Copyright notice may not be removed or altered from any
46 source or altered source distribution.
48 The copyright holders and contributing author(s) specifically
49 permit, without fee, and encourage the use of this source code
50 as a component for supporting the Hypertext Markup Language in
51 commercial products. If you use this source code in a product,
52 acknowledgment is not required but would be appreciated.
57 import java.io.FileInputStream;
58 import java.io.FileNotFoundException;
59 import java.io.FileOutputStream;
60 import java.io.FileWriter;
61 import java.io.IOException;
62 import java.io.InputStream;
63 import java.io.OutputStream;
64 import java.io.PrintWriter;
65 import java.util.Properties;
67 import org.eclipse.core.resources.IFile;
68 import org.eclipse.core.resources.IMarker;
69 import org.eclipse.core.runtime.CoreException;
73 * <p>HTML parser and pretty printer</p>
76 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
77 * See Tidy.java for the copyright notice.
78 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
79 * HTML Tidy Release 4 Aug 2000</a>
83 * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
84 * Institute of Technology, Institut National de Recherche en
85 * Informatique et en Automatique, Keio University). All Rights
90 * Contributing Author(s):<br>
91 * <a href="mailto:dsr@w3.org">Dave Raggett</a><br>
92 * <a href="mailto:ac.quick@sympatico.ca">Andy Quick</a> (translation to Java)
96 * The contributing author(s) would like to thank all those who
97 * helped with testing, bug fixes, and patience. This wouldn't
98 * have been possible without all of you.
102 * COPYRIGHT NOTICE:<br>
104 * This software and documentation is provided "as is," and
105 * the copyright holders and contributing author(s) make no
106 * representations or warranties, express or implied, including
107 * but not limited to, warranties of merchantability or fitness
108 * for any particular purpose or that the use of the software or
109 * documentation will not infringe any third party patents,
110 * copyrights, trademarks or other rights.
114 * The copyright holders and contributing author(s) will not be
115 * liable for any direct, indirect, special or consequential damages
116 * arising out of any use of the software or documentation, even if
117 * advised of the possibility of such damage.
121 * Permission is hereby granted to use, copy, modify, and distribute
122 * this source code, or portions hereof, documentation and executables,
123 * for any purpose, without fee, subject to the following restrictions:
128 * <li>The origin of this source code must not be misrepresented.</li>
129 * <li>Altered versions must be plainly marked as such and must
130 * not be misrepresented as being the original source.</li>
131 * <li>This Copyright notice may not be removed or altered from any
132 * source or altered source distribution.</li>
137 * The copyright holders and contributing author(s) specifically
138 * permit, without fee, and encourage the use of this source code
139 * as a component for supporting the Hypertext Markup Language in
140 * commercial products. If you use this source code in a product,
141 * acknowledgment is not required but would be appreciated.
144 * @author Dave Raggett <dsr@w3.org>
145 * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
146 * @version 1.0, 1999/05/22
147 * @version 1.0.1, 1999/05/29
148 * @version 1.1, 1999/06/18 Java Bean
149 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
150 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
151 * @version 1.4, 1999/09/04 DOM support
152 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
153 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
154 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
155 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
156 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
157 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
158 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
162 public class Tidy implements java.io.Serializable {
164 static final long serialVersionUID = -2794371560623987718L;
166 private boolean initialized = false;
167 private PrintWriter errout = null; /* error output stream */
168 private PrintWriter stderr = null;
169 private Configuration configuration = null;
170 private String inputStreamName = "InputStream";
171 private int parseErrors = 0;
172 private int parseWarnings = 0;
178 public Configuration getConfiguration() {
179 return configuration;
182 public PrintWriter getStderr() {
187 * ParseErrors - the number of errors that occurred in the most
188 * recent parse operation
191 public int getParseErrors() {
196 * ParseWarnings - the number of warnings that occurred in the most
197 * recent parse operation
200 public int getParseWarnings() {
201 return parseWarnings;
205 * Errout - the error output stream
208 public PrintWriter getErrout() {
212 public void setErrout(PrintWriter errout) {
213 this.errout = errout;
217 * Spaces - default indentation
218 * @see org.w3c.tidy.Configuration#spaces
221 public void setSpaces(int spaces) {
222 configuration.spaces = spaces;
225 public int getSpaces() {
226 return configuration.spaces;
230 * Wraplen - default wrap margin
231 * @see org.w3c.tidy.Configuration#wraplen
234 public void setWraplen(int wraplen) {
235 configuration.wraplen = wraplen;
238 public int getWraplen() {
239 return configuration.wraplen;
244 * @see org.w3c.tidy.Configuration#CharEncoding
247 public void setCharEncoding(int charencoding) {
248 configuration.CharEncoding = charencoding;
251 public int getCharEncoding() {
252 return configuration.CharEncoding;
257 * @see org.w3c.tidy.Configuration#tabsize
260 public void setTabsize(int tabsize) {
261 configuration.tabsize = tabsize;
264 public int getTabsize() {
265 return configuration.tabsize;
269 * Errfile - file name to write errors to
270 * @see org.w3c.tidy.Configuration#errfile
273 public void setErrfile(String errfile) {
274 configuration.errfile = errfile;
277 public String getErrfile() {
278 return configuration.errfile;
282 * Writeback - if true then output tidied markup
283 * NOTE: this property is ignored when parsing from an InputStream.
284 * @see org.w3c.tidy.Configuration#writeback
287 public void setWriteback(boolean writeback) {
288 configuration.writeback = writeback;
291 public boolean getWriteback() {
292 return configuration.writeback;
296 * OnlyErrors - if true normal output is suppressed
297 * @see org.w3c.tidy.Configuration#OnlyErrors
300 public void setOnlyErrors(boolean OnlyErrors) {
301 configuration.OnlyErrors = OnlyErrors;
304 public boolean getOnlyErrors() {
305 return configuration.OnlyErrors;
309 * ShowWarnings - however errors are always shown
310 * @see org.w3c.tidy.Configuration#ShowWarnings
313 public void setShowWarnings(boolean ShowWarnings) {
314 configuration.ShowWarnings = ShowWarnings;
317 public boolean getShowWarnings() {
318 return configuration.ShowWarnings;
322 * Quiet - no 'Parsing X', guessed DTD or summary
323 * @see org.w3c.tidy.Configuration#Quiet
326 public void setQuiet(boolean Quiet) {
327 configuration.Quiet = Quiet;
330 public boolean getQuiet() {
331 return configuration.Quiet;
335 * IndentContent - indent content of appropriate tags
336 * @see org.w3c.tidy.Configuration#IndentContent
339 public void setIndentContent(boolean IndentContent) {
340 configuration.IndentContent = IndentContent;
343 public boolean getIndentContent() {
344 return configuration.IndentContent;
348 * SmartIndent - does text/block level content effect indentation
349 * @see org.w3c.tidy.Configuration#SmartIndent
352 public void setSmartIndent(boolean SmartIndent) {
353 configuration.SmartIndent = SmartIndent;
356 public boolean getSmartIndent() {
357 return configuration.SmartIndent;
361 * HideEndTags - suppress optional end tags
362 * @see org.w3c.tidy.Configuration#HideEndTags
365 public void setHideEndTags(boolean HideEndTags) {
366 configuration.HideEndTags = HideEndTags;
369 public boolean getHideEndTags() {
370 return configuration.HideEndTags;
374 * XmlTags - treat input as XML
375 * @see org.w3c.tidy.Configuration#XmlTags
378 public void setXmlTags(boolean XmlTags) {
379 configuration.XmlTags = XmlTags;
382 public boolean getXmlTags() {
383 return configuration.XmlTags;
387 * XmlOut - create output as XML
388 * @see org.w3c.tidy.Configuration#XmlOut
391 public void setXmlOut(boolean XmlOut) {
392 configuration.XmlOut = XmlOut;
395 public boolean getXmlOut() {
396 return configuration.XmlOut;
400 * XHTML - output extensible HTML
401 * @see org.w3c.tidy.Configuration#xHTML
404 public void setXHTML(boolean xHTML) {
405 configuration.xHTML = xHTML;
408 public boolean getXHTML() {
409 return configuration.xHTML;
413 * RawOut - avoid mapping values > 127 to entities
414 * @see org.w3c.tidy.Configuration#RawOut
417 public void setRawOut(boolean RawOut) {
418 configuration.RawOut = RawOut;
421 public boolean getRawOut() {
422 return configuration.RawOut;
426 * UpperCaseTags - output tags in upper not lower case
427 * @see org.w3c.tidy.Configuration#UpperCaseTags
430 public void setUpperCaseTags(boolean UpperCaseTags) {
431 configuration.UpperCaseTags = UpperCaseTags;
434 public boolean getUpperCaseTags() {
435 return configuration.UpperCaseTags;
439 * UpperCaseAttrs - output attributes in upper not lower case
440 * @see org.w3c.tidy.Configuration#UpperCaseAttrs
443 public void setUpperCaseAttrs(boolean UpperCaseAttrs) {
444 configuration.UpperCaseAttrs = UpperCaseAttrs;
447 public boolean getUpperCaseAttrs() {
448 return configuration.UpperCaseAttrs;
452 * MakeClean - remove presentational clutter
453 * @see org.w3c.tidy.Configuration#MakeClean
456 public void setMakeClean(boolean MakeClean) {
457 configuration.MakeClean = MakeClean;
460 public boolean getMakeClean() {
461 return configuration.MakeClean;
465 * BreakBeforeBR - o/p newline before <br> or not?
466 * @see org.w3c.tidy.Configuration#BreakBeforeBR
469 public void setBreakBeforeBR(boolean BreakBeforeBR) {
470 configuration.BreakBeforeBR = BreakBeforeBR;
473 public boolean getBreakBeforeBR() {
474 return configuration.BreakBeforeBR;
478 * BurstSlides - create slides on each h2 element
479 * @see org.w3c.tidy.Configuration#BurstSlides
482 public void setBurstSlides(boolean BurstSlides) {
483 configuration.BurstSlides = BurstSlides;
486 public boolean getBurstSlides() {
487 return configuration.BurstSlides;
491 * NumEntities - use numeric entities
492 * @see org.w3c.tidy.Configuration#NumEntities
495 public void setNumEntities(boolean NumEntities) {
496 configuration.NumEntities = NumEntities;
499 public boolean getNumEntities() {
500 return configuration.NumEntities;
504 * QuoteMarks - output " marks as &quot;
505 * @see org.w3c.tidy.Configuration#QuoteMarks
508 public void setQuoteMarks(boolean QuoteMarks) {
509 configuration.QuoteMarks = QuoteMarks;
512 public boolean getQuoteMarks() {
513 return configuration.QuoteMarks;
517 * QuoteNbsp - output non-breaking space as entity
518 * @see org.w3c.tidy.Configuration#QuoteNbsp
521 public void setQuoteNbsp(boolean QuoteNbsp) {
522 configuration.QuoteNbsp = QuoteNbsp;
525 public boolean getQuoteNbsp() {
526 return configuration.QuoteNbsp;
530 * QuoteAmpersand - output naked ampersand as &
531 * @see org.w3c.tidy.Configuration#QuoteAmpersand
534 public void setQuoteAmpersand(boolean QuoteAmpersand) {
535 configuration.QuoteAmpersand = QuoteAmpersand;
538 public boolean getQuoteAmpersand() {
539 return configuration.QuoteAmpersand;
543 * WrapAttVals - wrap within attribute values
544 * @see org.w3c.tidy.Configuration#WrapAttVals
547 public void setWrapAttVals(boolean WrapAttVals) {
548 configuration.WrapAttVals = WrapAttVals;
551 public boolean getWrapAttVals() {
552 return configuration.WrapAttVals;
556 * WrapScriptlets - wrap within JavaScript string literals
557 * @see org.w3c.tidy.Configuration#WrapScriptlets
560 public void setWrapScriptlets(boolean WrapScriptlets) {
561 configuration.WrapScriptlets = WrapScriptlets;
564 public boolean getWrapScriptlets() {
565 return configuration.WrapScriptlets;
569 * WrapSection - wrap within <![ ... ]> section tags
570 * @see org.w3c.tidy.Configuration#WrapSection
573 public void setWrapSection(boolean WrapSection) {
574 configuration.WrapSection = WrapSection;
577 public boolean getWrapSection() {
578 return configuration.WrapSection;
582 * AltText - default text for alt attribute
583 * @see org.w3c.tidy.Configuration#altText
586 public void setAltText(String altText) {
587 configuration.altText = altText;
590 public String getAltText() {
591 return configuration.altText;
595 * Slidestyle - style sheet for slides
596 * @see org.w3c.tidy.Configuration#slidestyle
599 public void setSlidestyle(String slidestyle) {
600 configuration.slidestyle = slidestyle;
603 public String getSlidestyle() {
604 return configuration.slidestyle;
608 * XmlPi - add <?xml?> for XML docs
609 * @see org.w3c.tidy.Configuration#XmlPi
612 public void setXmlPi(boolean XmlPi) {
613 configuration.XmlPi = XmlPi;
616 public boolean getXmlPi() {
617 return configuration.XmlPi;
621 * DropFontTags - discard presentation tags
622 * @see org.w3c.tidy.Configuration#DropFontTags
625 public void setDropFontTags(boolean DropFontTags) {
626 configuration.DropFontTags = DropFontTags;
629 public boolean getDropFontTags() {
630 return configuration.DropFontTags;
634 * DropEmptyParas - discard empty p elements
635 * @see org.w3c.tidy.Configuration#DropEmptyParas
638 public void setDropEmptyParas(boolean DropEmptyParas) {
639 configuration.DropEmptyParas = DropEmptyParas;
642 public boolean getDropEmptyParas() {
643 return configuration.DropEmptyParas;
647 * FixComments - fix comments with adjacent hyphens
648 * @see org.w3c.tidy.Configuration#FixComments
651 public void setFixComments(boolean FixComments) {
652 configuration.FixComments = FixComments;
655 public boolean getFixComments() {
656 return configuration.FixComments;
660 * WrapAsp - wrap within ASP pseudo elements
661 * @see org.w3c.tidy.Configuration#WrapAsp
664 public void setWrapAsp(boolean WrapAsp) {
665 configuration.WrapAsp = WrapAsp;
668 public boolean getWrapAsp() {
669 return configuration.WrapAsp;
673 * WrapJste - wrap within JSTE pseudo elements
674 * @see org.w3c.tidy.Configuration#WrapJste
677 public void setWrapJste(boolean WrapJste) {
678 configuration.WrapJste = WrapJste;
681 public boolean getWrapJste() {
682 return configuration.WrapJste;
686 * WrapPhp - wrap within PHP pseudo elements
687 * @see org.w3c.tidy.Configuration#WrapPhp
690 public void setWrapPhp(boolean WrapPhp) {
691 configuration.WrapPhp = WrapPhp;
694 public boolean getWrapPhp() {
695 return configuration.WrapPhp;
699 * FixBackslash - fix URLs by replacing \ with /
700 * @see org.w3c.tidy.Configuration#FixBackslash
703 public void setFixBackslash(boolean FixBackslash) {
704 configuration.FixBackslash = FixBackslash;
707 public boolean getFixBackslash() {
708 return configuration.FixBackslash;
712 * IndentAttributes - newline+indent before each attribute
713 * @see org.w3c.tidy.Configuration#IndentAttributes
716 public void setIndentAttributes(boolean IndentAttributes) {
717 configuration.IndentAttributes = IndentAttributes;
720 public boolean getIndentAttributes() {
721 return configuration.IndentAttributes;
725 * DocType - user specified doctype
726 * omit | auto | strict | loose | <i>fpi</i>
727 * where the <i>fpi</i> is a string similar to
728 * "-//ACME//DTD HTML 3.14159//EN"
729 * Note: for <i>fpi</i> include the double-quotes in the string.
730 * @see org.w3c.tidy.Configuration#docTypeStr
731 * @see org.w3c.tidy.Configuration#docTypeMode
734 public void setDocType(String doctype) {
736 configuration.docTypeStr = configuration.parseDocType(doctype, "doctype");
739 public String getDocType() {
740 String result = null;
741 switch (configuration.docTypeMode) {
742 case Configuration.DOCTYPE_OMIT :
745 case Configuration.DOCTYPE_AUTO :
748 case Configuration.DOCTYPE_STRICT :
751 case Configuration.DOCTYPE_LOOSE :
754 case Configuration.DOCTYPE_USER :
755 result = configuration.docTypeStr;
762 * LogicalEmphasis - replace i by em and b by strong
763 * @see org.w3c.tidy.Configuration#LogicalEmphasis
766 public void setLogicalEmphasis(boolean LogicalEmphasis) {
767 configuration.LogicalEmphasis = LogicalEmphasis;
770 public boolean getLogicalEmphasis() {
771 return configuration.LogicalEmphasis;
775 * XmlPIs - if set to true PIs must end with ?>
776 * @see org.w3c.tidy.Configuration#XmlPIs
779 public void setXmlPIs(boolean XmlPIs) {
780 configuration.XmlPIs = XmlPIs;
783 public boolean getXmlPIs() {
784 return configuration.XmlPIs;
788 * EncloseText - if true text at body is wrapped in <p>'s
789 * @see org.w3c.tidy.Configuration#EncloseBodyText
792 public void setEncloseText(boolean EncloseText) {
793 configuration.EncloseBodyText = EncloseText;
796 public boolean getEncloseText() {
797 return configuration.EncloseBodyText;
801 * EncloseBlockText - if true text in blocks is wrapped in <p>'s
802 * @see org.w3c.tidy.Configuration#EncloseBlockText
805 public void setEncloseBlockText(boolean EncloseBlockText) {
806 configuration.EncloseBlockText = EncloseBlockText;
809 public boolean getEncloseBlockText() {
810 return configuration.EncloseBlockText;
814 * KeepFileTimes - if true last modified time is preserved<br>
815 * <b>this is NOT supported at this time.</b>
816 * @see org.w3c.tidy.Configuration#KeepFileTimes
819 public void setKeepFileTimes(boolean KeepFileTimes) {
820 configuration.KeepFileTimes = KeepFileTimes;
823 public boolean getKeepFileTimes() {
824 return configuration.KeepFileTimes;
828 * Word2000 - draconian cleaning for Word2000
829 * @see org.w3c.tidy.Configuration#Word2000
832 public void setWord2000(boolean Word2000) {
833 configuration.Word2000 = Word2000;
836 public boolean getWord2000() {
837 return configuration.Word2000;
841 * TidyMark - add meta element indicating tidied doc
842 * @see org.w3c.tidy.Configuration#TidyMark
845 public void setTidyMark(boolean TidyMark) {
846 configuration.TidyMark = TidyMark;
849 public boolean getTidyMark() {
850 return configuration.TidyMark;
854 * XmlSpace - if set to yes adds xml:space attr as needed
855 * @see org.w3c.tidy.Configuration#XmlSpace
858 public void setXmlSpace(boolean XmlSpace) {
859 configuration.XmlSpace = XmlSpace;
862 public boolean getXmlSpace() {
863 return configuration.XmlSpace;
867 * Emacs - if true format error output for GNU Emacs
868 * @see org.w3c.tidy.Configuration#Emacs
871 public void setEmacs(boolean Emacs) {
872 configuration.Emacs = Emacs;
875 public boolean getEmacs() {
876 return configuration.Emacs;
880 * LiteralAttribs - if true attributes may use newlines
881 * @see org.w3c.tidy.Configuration#LiteralAttribs
884 public void setLiteralAttribs(boolean LiteralAttribs) {
885 configuration.LiteralAttribs = LiteralAttribs;
888 public boolean getLiteralAttribs() {
889 return configuration.LiteralAttribs;
893 * InputStreamName - the name of the input stream (printed in the
894 * header information).
896 public void setInputStreamName(String name) {
898 inputStreamName = name;
901 public String getInputStreamName() {
902 return inputStreamName;
906 * Sets the configuration from a configuration file.
909 public void setConfigurationFromFile(String filename) {
910 configuration.parseFile(filename);
914 * Sets the configuration from a properties object.
917 public void setConfigurationFromProps(Properties props) {
918 configuration.addProps(props);
922 * first time initialization which should
923 * precede reading the command line
926 private void init() {
927 configuration = new Configuration();
928 if (configuration == null)
931 AttributeTable at = AttributeTable.getDefaultAttributeTable();
934 TagTable tt = new TagTable();
937 tt.setConfiguration(configuration);
938 configuration.tt = tt;
939 EntityTable et = EntityTable.getDefaultEntityTable();
943 /* Unnecessary - same initial values in Configuration
944 Configuration.XmlTags = false;
945 Configuration.XmlOut = false;
946 Configuration.HideEndTags = false;
947 Configuration.UpperCaseTags = false;
948 Configuration.MakeClean = false;
949 Configuration.writeback = false;
950 Configuration.OnlyErrors = false;
953 configuration.errfile = null;
954 stderr = new PrintWriter(System.err, true);
960 * Parses InputStream in and returns the root Node.
961 * If out is non-null, pretty prints to OutputStream out.
964 public Node parse(IFile iFile, InputStream in, OutputStream out) {
965 Node document = null;
968 iFile.deleteMarkers(IMarker.PROBLEM, false, 0);
969 document = parse(iFile, in, null, out);
970 } catch (CoreException e) {
971 } catch (FileNotFoundException fnfe) {
972 } catch (IOException e) {
979 * Internal routine that actually does the parsing. The caller
980 * can pass either an InputStream or file name. If both are passed,
981 * the file name is preferred.
984 private Node parse(IFile iFile, InputStream in, String file, OutputStream out) throws FileNotFoundException, IOException {
986 Node document = null;
988 Out o = new OutImpl(); /* normal output stream */
1000 /* ensure config is self-consistent */
1001 configuration.adjust();
1004 in = new FileInputStream(file);
1005 inputStreamName = file;
1006 } else if (in == null) {
1008 inputStreamName = "stdin";
1012 lexer = new Lexer(iFile,new StreamInImpl(in, configuration.CharEncoding, configuration.tabsize), configuration);
1013 lexer.errout = errout;
1016 store pointer to lexer in input stream
1017 to allow character encoding errors to be
1020 lexer.in.lexer = lexer;
1022 /* Tidy doesn't alter the doctype for generic XML docs */
1023 if (configuration.XmlTags)
1024 document = ParserImpl.parseXMLDocument(lexer);
1027 if (!configuration.Quiet)
1028 Report.helloMessage(errout, Report.RELEASE_DATE, inputStreamName);
1030 document = ParserImpl.parseDocument(lexer);
1032 if (!document.checkNodeIntegrity()) {
1033 Report.badTree(errout);
1037 Clean cleaner = new Clean(configuration.tt);
1039 /* simplifies <b><b> ... </b> ...</b> etc. */
1040 cleaner.nestedEmphasis(document);
1042 /* cleans up <dir>indented text</dir> etc. */
1043 cleaner.list2BQ(document);
1044 cleaner.bQ2Div(document);
1046 /* replaces i by em and b by strong */
1047 if (configuration.LogicalEmphasis)
1048 cleaner.emFromI(document);
1050 if (configuration.Word2000 && cleaner.isWord2000(document, configuration.tt)) {
1051 /* prune Word2000's <![if ...]> ... <![endif]> */
1052 cleaner.dropSections(lexer, document);
1054 /* drop style & class attributes and empty p, span elements */
1055 cleaner.cleanWord2000(lexer, document);
1058 /* replaces presentational markup by style rules */
1059 if (configuration.MakeClean || configuration.DropFontTags)
1060 cleaner.cleanTree(lexer, document);
1062 if (!document.checkNodeIntegrity()) {
1063 Report.badTree(errout);
1066 doctype = document.findDocType();
1067 if (document.content != null) {
1068 if (configuration.xHTML)
1069 lexer.setXHTMLDocType(document);
1071 lexer.fixDocType(document);
1073 if (configuration.TidyMark)
1074 lexer.addGenerator(document);
1077 /* ensure presence of initial <?XML version="1.0"?> */
1078 if (configuration.XmlOut && configuration.XmlPi)
1079 lexer.fixXMLPI(document);
1081 if (!configuration.Quiet && document.content != null) {
1082 Report.reportVersion(errout, lexer, inputStreamName, doctype);
1083 Report.reportNumWarnings(errout, lexer);
1087 parseWarnings = lexer.warnings;
1088 parseErrors = lexer.errors;
1090 // Try to close the InputStream but only if if we created it.
1092 if ((file != null) && (in != System.in)) {
1095 } catch (IOException e) {
1099 if (lexer.errors > 0)
1100 Report.needsAuthorIntervention(errout);
1102 o.state = StreamIn.FSM_ASCII;
1103 o.encoding = configuration.CharEncoding;
1105 if (!configuration.OnlyErrors && lexer.errors == 0) {
1106 if (configuration.BurstSlides) {
1111 remove doctype to avoid potential clash with
1112 markup introduced when bursting into slides
1114 /* discard the document type */
1115 doctype = document.findDocType();
1117 if (doctype != null)
1118 Node.discardElement(doctype);
1120 /* slides use transitional features */
1121 lexer.versions |= Dict.VERS_HTML40_LOOSE;
1123 /* and patch up doctype to match */
1124 if (configuration.xHTML)
1125 lexer.setXHTMLDocType(document);
1127 lexer.fixDocType(document);
1129 /* find the body element which may be implicit */
1130 body = document.findBody(configuration.tt);
1133 pprint = new PPrint(configuration);
1134 Report.reportNumberOfSlides(errout, pprint.countSlides(body));
1135 pprint.createSlides(lexer, document);
1137 Report.missingBody(errout);
1138 } else if (configuration.writeback && (file != null)) {
1140 pprint = new PPrint(configuration);
1141 o.out = new FileOutputStream(file);
1143 if (configuration.XmlTags)
1144 pprint.printXMLTree(o, (short) 0, 0, lexer, document);
1146 pprint.printTree(o, (short) 0, 0, lexer, document);
1148 pprint.flushLine(o, 0);
1150 } catch (IOException e) {
1151 errout.println(file + e.toString());
1153 } else if (out != null) {
1154 pprint = new PPrint(configuration);
1157 if (configuration.XmlTags)
1158 pprint.printXMLTree(o, (short) 0, 0, lexer, document);
1160 pprint.printTree(o, (short) 0, 0, lexer, document);
1162 pprint.flushLine(o, 0);
1167 Report.errorSummary(lexer);
1173 * Parses InputStream in and returns a DOM Document node.
1174 * If out is non-null, pretty prints to OutputStream out.
1177 public org.w3c.dom.Document parseDOM(IFile file, InputStream in, OutputStream out) {
1178 Node document = parse(file, in, out);
1179 if (document != null)
1180 return (org.w3c.dom.Document) document.getAdapter();
1186 * Creates an empty DOM Document.
1189 public static org.w3c.dom.Document createEmptyDocument() {
1190 Node document = new Node(Node.RootNode, new byte[0], 0, 0);
1191 Node node = new Node(Node.StartTag, new byte[0], 0, 0, "html", new TagTable());
1192 if (document != null && node != null) {
1193 Node.insertNodeAtStart(document, node);
1194 return (org.w3c.dom.Document) document.getAdapter();
1201 * Pretty-prints a DOM Document.
1204 public void pprint(org.w3c.dom.Document doc, OutputStream out) {
1205 Out o = new OutImpl();
1209 if (!(doc instanceof DOMDocumentImpl)) {
1212 document = ((DOMDocumentImpl) doc).adaptee;
1214 o.state = StreamIn.FSM_ASCII;
1215 o.encoding = configuration.CharEncoding;
1218 pprint = new PPrint(configuration);
1221 if (configuration.XmlTags)
1222 pprint.printXMLTree(o, (short) 0, 0, null, document);
1224 pprint.printTree(o, (short) 0, 0, null, document);
1226 pprint.flushLine(o, 0);
1231 * Command line interface to parser and pretty printer.
1234 public static void main(String[] argv) {
1235 int totalerrors = 0;
1236 int totalwarnings = 0;
1239 String prog = "Tidy";
1244 Out out = new OutImpl(); /* normal output stream */
1246 int argc = argv.length + 1;
1249 Configuration configuration;
1251 String current_errorfile = "stderr";
1254 configuration = tidy.getConfiguration();
1256 /* read command line */
1259 if (argc > 1 && argv[argIndex].startsWith("-")) {
1260 /* support -foo and --foo */
1261 arg = argv[argIndex].substring(1);
1263 if (arg.length() > 0 && arg.charAt(0) == '-')
1264 arg = arg.substring(1);
1266 if (arg.equals("xml"))
1267 configuration.XmlTags = true;
1268 else if (arg.equals("asxml") || arg.equals("asxhtml"))
1269 configuration.xHTML = true;
1270 else if (arg.equals("indent")) {
1271 configuration.IndentContent = true;
1272 configuration.SmartIndent = true;
1273 } else if (arg.equals("omit"))
1274 configuration.HideEndTags = true;
1275 else if (arg.equals("upper"))
1276 configuration.UpperCaseTags = true;
1277 else if (arg.equals("clean"))
1278 configuration.MakeClean = true;
1279 else if (arg.equals("raw"))
1280 configuration.CharEncoding = Configuration.RAW;
1281 else if (arg.equals("ascii"))
1282 configuration.CharEncoding = Configuration.ASCII;
1283 else if (arg.equals("latin1"))
1284 configuration.CharEncoding = Configuration.LATIN1;
1285 else if (arg.equals("utf8"))
1286 configuration.CharEncoding = Configuration.UTF8;
1287 else if (arg.equals("iso2022"))
1288 configuration.CharEncoding = Configuration.ISO2022;
1289 else if (arg.equals("mac"))
1290 configuration.CharEncoding = Configuration.MACROMAN;
1291 else if (arg.equals("numeric"))
1292 configuration.NumEntities = true;
1293 else if (arg.equals("modify"))
1294 configuration.writeback = true;
1295 else if (arg.equals("change")) /* obsolete */
1296 configuration.writeback = true;
1297 else if (arg.equals("update")) /* obsolete */
1298 configuration.writeback = true;
1299 else if (arg.equals("errors"))
1300 configuration.OnlyErrors = true;
1301 else if (arg.equals("quiet"))
1302 configuration.Quiet = true;
1303 else if (arg.equals("slides"))
1304 configuration.BurstSlides = true;
1305 else if (arg.equals("help") || argv[argIndex].charAt(1) == '?' || argv[argIndex].charAt(1) == 'h') {
1306 Report.helpText(new PrintWriter(System.out, true), prog);
1308 } else if (arg.equals("config")) {
1310 configuration.parseFile(argv[argIndex + 1]);
1314 } else if (argv[argIndex].equals("-file") || argv[argIndex].equals("--file") || argv[argIndex].equals("-f")) {
1316 configuration.errfile = argv[argIndex + 1];
1320 } else if (argv[argIndex].equals("-wrap") || argv[argIndex].equals("--wrap") || argv[argIndex].equals("-w")) {
1322 configuration.wraplen = Integer.parseInt(argv[argIndex + 1]);
1326 } else if (argv[argIndex].equals("-version") || argv[argIndex].equals("--version") || argv[argIndex].equals("-v")) {
1327 Report.showVersion(tidy.getErrout());
1332 for (int i = 1; i < s.length(); i++) {
1333 if (s.charAt(i) == 'i') {
1334 configuration.IndentContent = true;
1335 configuration.SmartIndent = true;
1336 } else if (s.charAt(i) == 'o')
1337 configuration.HideEndTags = true;
1338 else if (s.charAt(i) == 'u')
1339 configuration.UpperCaseTags = true;
1340 else if (s.charAt(i) == 'c')
1341 configuration.MakeClean = true;
1342 else if (s.charAt(i) == 'n')
1343 configuration.NumEntities = true;
1344 else if (s.charAt(i) == 'm')
1345 configuration.writeback = true;
1346 else if (s.charAt(i) == 'e')
1347 configuration.OnlyErrors = true;
1348 else if (s.charAt(i) == 'q')
1349 configuration.Quiet = true;
1351 Report.unknownOption(tidy.getErrout(), s.charAt(i));
1360 /* ensure config is self-consistent */
1361 configuration.adjust();
1363 /* user specified error file */
1364 if (configuration.errfile != null) {
1365 /* is it same as the currently opened file? */
1366 if (!configuration.errfile.equals(current_errorfile)) {
1367 /* no so close previous error file */
1369 if (tidy.getErrout() != tidy.getStderr())
1370 tidy.getErrout().close();
1372 /* and try to open the new error file */
1374 tidy.setErrout(new PrintWriter(new FileWriter(configuration.errfile), true));
1375 current_errorfile = configuration.errfile;
1376 } catch (IOException e) {
1377 /* can't be opened so fall back to stderr */
1378 current_errorfile = "stderr";
1379 tidy.setErrout(tidy.getStderr());
1385 file = argv[argIndex];
1391 document = tidy.parse(null, null, file, System.out);
1392 totalwarnings += tidy.parseWarnings;
1393 totalerrors += tidy.parseErrors;
1394 } catch (FileNotFoundException fnfe) {
1395 Report.unknownFile(tidy.getErrout(), prog, file);
1396 } catch (IOException ioe) {
1397 Report.unknownFile(tidy.getErrout(), prog, file);
1407 if (totalerrors + totalwarnings > 0)
1408 Report.generalInfo(tidy.getErrout());
1410 if (tidy.getErrout() != tidy.getStderr())
1411 tidy.getErrout().close();
1413 /* return status can be used by scripts */
1415 if (totalerrors > 0)
1418 if (totalwarnings > 0)
1421 /* 0 signifies all is ok */