RSS news reader; initially copied from "all the news"
[phpeclipse.git] / archive / net.sourceforge.phpeclipse.jtidy / src / net / sourceforge / phpdt / tidy / w3c / Tidy.java
1 /*
2  * @(#)Tidy.java   1.11 2000/08/16
3  *
4  */
5
6 /*
7   HTML parser and pretty printer
8
9   Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
10   Institute of Technology, Institut National de Recherche en
11   Informatique et en Automatique, Keio University). All Rights
12   Reserved.
13
14   Contributing Author(s):
15
16      Dave Raggett <dsr@w3.org>
17      Andy Quick <ac.quick@sympatico.ca> (translation to Java)
18
19   The contributing author(s) would like to thank all those who
20   helped with testing, bug fixes, and patience.  This wouldn't
21   have been possible without all of you.
22
23   COPYRIGHT NOTICE:
24  
25   This software and documentation is provided "as is," and
26   the copyright holders and contributing author(s) make no
27   representations or warranties, express or implied, including
28   but not limited to, warranties of merchantability or fitness
29   for any particular purpose or that the use of the software or
30   documentation will not infringe any third party patents,
31   copyrights, trademarks or other rights. 
32
33   The copyright holders and contributing author(s) will not be
34   liable for any direct, indirect, special or consequential damages
35   arising out of any use of the software or documentation, even if
36   advised of the possibility of such damage.
37
38   Permission is hereby granted to use, copy, modify, and distribute
39   this source code, or portions hereof, documentation and executables,
40   for any purpose, without fee, subject to the following restrictions:
41
42   1. The origin of this source code must not be misrepresented.
43   2. Altered versions must be plainly marked as such and must
44      not be misrepresented as being the original source.
45   3. This Copyright notice may not be removed or altered from any
46      source or altered source distribution.
47  
48   The copyright holders and contributing author(s) specifically
49   permit, without fee, and encourage the use of this source code
50   as a component for supporting the Hypertext Markup Language in
51   commercial products. If you use this source code in a product,
52   acknowledgment is not required but would be appreciated.
53 */
54
55 package net.sourceforge.phpdt.tidy.w3c;
56
57 import java.io.FileInputStream;
58 import java.io.FileNotFoundException;
59 import java.io.FileOutputStream;
60 import java.io.FileWriter;
61 import java.io.IOException;
62 import java.io.InputStream;
63 import java.io.OutputStream;
64 import java.io.PrintWriter;
65 import java.util.Properties;
66
67 import org.eclipse.core.resources.IFile;
68
69 /**
70  *
71  * <p>HTML parser and pretty printer</p>
72  *
73  * <p>
74  * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
75  * See Tidy.java for the copyright notice.
76  * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
77  * HTML Tidy Release 4 Aug 2000</a>
78  * </p>
79  *
80  * <p>
81  * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
82  * Institute of Technology, Institut National de Recherche en
83  * Informatique et en Automatique, Keio University). All Rights
84  * Reserved.
85  * </p>
86  *
87  * <p>
88  * Contributing Author(s):<br>
89  *    <a href="mailto:dsr@w3.org">Dave Raggett</a><br>
90  *    <a href="mailto:ac.quick@sympatico.ca">Andy Quick</a> (translation to Java)
91  * </p>
92  *
93  * <p>
94  * The contributing author(s) would like to thank all those who
95  * helped with testing, bug fixes, and patience.  This wouldn't
96  * have been possible without all of you.
97  * </p>
98  *
99  * <p>
100  * COPYRIGHT NOTICE:<br>
101  * 
102  * This software and documentation is provided "as is," and
103  * the copyright holders and contributing author(s) make no
104  * representations or warranties, express or implied, including
105  * but not limited to, warranties of merchantability or fitness
106  * for any particular purpose or that the use of the software or
107  * documentation will not infringe any third party patents,
108  * copyrights, trademarks or other rights. 
109  * </p>
110  *
111  * <p>
112  * The copyright holders and contributing author(s) will not be
113  * liable for any direct, indirect, special or consequential damages
114  * arising out of any use of the software or documentation, even if
115  * advised of the possibility of such damage.
116  * </p>
117  *
118  * <p>
119  * Permission is hereby granted to use, copy, modify, and distribute
120  * this source code, or portions hereof, documentation and executables,
121  * for any purpose, without fee, subject to the following restrictions:
122  * </p>
123  *
124  * <p>
125  * <ol>
126  * <li>The origin of this source code must not be misrepresented.</li>
127  * <li>Altered versions must be plainly marked as such and must
128  * not be misrepresented as being the original source.</li>
129  * <li>This Copyright notice may not be removed or altered from any
130  * source or altered source distribution.</li>
131  * </ol>
132  * </p>
133  *
134  * <p>
135  * The copyright holders and contributing author(s) specifically
136  * permit, without fee, and encourage the use of this source code
137  * as a component for supporting the Hypertext Markup Language in
138  * commercial products. If you use this source code in a product,
139  * acknowledgment is not required but would be appreciated.
140  * </p>
141  *
142  * @author  Dave Raggett <dsr@w3.org>
143  * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
144  * @version 1.0, 1999/05/22
145  * @version 1.0.1, 1999/05/29
146  * @version 1.1, 1999/06/18 Java Bean
147  * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
148  * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
149  * @version 1.4, 1999/09/04 DOM support
150  * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
151  * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
152  * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
153  * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
154  * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
155  * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
156  * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
157  *
158  */
159
160 public class Tidy implements java.io.Serializable {
161
162         static final long serialVersionUID = -2794371560623987718L;
163
164         private boolean initialized = false;
165         private PrintWriter errout = null; /* error output stream */
166         private PrintWriter stderr = null;
167         private Configuration configuration = null;
168         private String inputStreamName = "InputStream";
169         private int parseErrors = 0;
170         private int parseWarnings = 0;
171
/**
 * Creates a new instance and immediately runs the private {@code init()} set-up.
 */
public Tidy() {
    this.init();
}
175
176         public Configuration getConfiguration() {
177                 return configuration;
178         }
179
180         public PrintWriter getStderr() {
181                 return stderr;
182         }
183
184         /**
185          * ParseErrors - the number of errors that occurred in the most
186          * recent parse operation
187          */
188
189         public int getParseErrors() {
190                 return parseErrors;
191         }
192
193         /**
194          * ParseWarnings - the number of warnings that occurred in the most
195          * recent parse operation
196          */
197
198         public int getParseWarnings() {
199                 return parseWarnings;
200         }
201
202         /**
203          * Errout - the error output stream
204          */
205
206         public PrintWriter getErrout() {
207                 return errout;
208         }
209
210         public void setErrout(PrintWriter errout) {
211                 this.errout = errout;
212         }
213
214         /**
215          * Spaces - default indentation
216          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#spaces
217          */
218
219         public void setSpaces(int spaces) {
220                 configuration.spaces = spaces;
221         }
222
223         public int getSpaces() {
224                 return configuration.spaces;
225         }
226
227         /**
228          * Wraplen - default wrap margin
229          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#wraplen
230          */
231
232         public void setWraplen(int wraplen) {
233                 configuration.wraplen = wraplen;
234         }
235
236         public int getWraplen() {
237                 return configuration.wraplen;
238         }
239
240         /**
241          * CharEncoding
242          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#CharEncoding
243          */
244
245         public void setCharEncoding(int charencoding) {
246                 configuration.CharEncoding = charencoding;
247         }
248
249         public int getCharEncoding() {
250                 return configuration.CharEncoding;
251         }
252
253         /**
254          * Tabsize
255          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#tabsize
256          */
257
258         public void setTabsize(int tabsize) {
259                 configuration.tabsize = tabsize;
260         }
261
262         public int getTabsize() {
263                 return configuration.tabsize;
264         }
265
266         /**
267          * Errfile - file name to write errors to
268          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#errfile
269          */
270
271         public void setErrfile(String errfile) {
272                 configuration.errfile = errfile;
273         }
274
275         public String getErrfile() {
276                 return configuration.errfile;
277         }
278
279         /**
280          * Writeback - if true then output tidied markup
281          * NOTE: this property is ignored when parsing from an InputStream.
282          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#writeback
283          */
284
285         public void setWriteback(boolean writeback) {
286                 configuration.writeback = writeback;
287         }
288
289         public boolean getWriteback() {
290                 return configuration.writeback;
291         }
292
293         /**
294          * OnlyErrors - if true normal output is suppressed
295          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#OnlyErrors
296          */
297
298         public void setOnlyErrors(boolean OnlyErrors) {
299                 configuration.OnlyErrors = OnlyErrors;
300         }
301
302         public boolean getOnlyErrors() {
303                 return configuration.OnlyErrors;
304         }
305
306         /**
307          * ShowWarnings - however errors are always shown
308          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#ShowWarnings
309          */
310
311         public void setShowWarnings(boolean ShowWarnings) {
312                 configuration.ShowWarnings = ShowWarnings;
313         }
314
315         public boolean getShowWarnings() {
316                 return configuration.ShowWarnings;
317         }
318
319         /**
320          * Quiet - no 'Parsing X', guessed DTD or summary
321          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#Quiet
322          */
323
324         public void setQuiet(boolean Quiet) {
325                 configuration.Quiet = Quiet;
326         }
327
328         public boolean getQuiet() {
329                 return configuration.Quiet;
330         }
331
332         /**
333          * IndentContent - indent content of appropriate tags
334          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#IndentContent
335          */
336
337         public void setIndentContent(boolean IndentContent) {
338                 configuration.IndentContent = IndentContent;
339         }
340
341         public boolean getIndentContent() {
342                 return configuration.IndentContent;
343         }
344
345         /**
346          * SmartIndent - does text/block level content affect indentation
347          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#SmartIndent
348          */
349
350         public void setSmartIndent(boolean SmartIndent) {
351                 configuration.SmartIndent = SmartIndent;
352         }
353
354         public boolean getSmartIndent() {
355                 return configuration.SmartIndent;
356         }
357
358         /**
359          * HideEndTags - suppress optional end tags
360          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#HideEndTags
361          */
362
363         public void setHideEndTags(boolean HideEndTags) {
364                 configuration.HideEndTags = HideEndTags;
365         }
366
367         public boolean getHideEndTags() {
368                 return configuration.HideEndTags;
369         }
370
371         /**
372          * XmlTags - treat input as XML
373          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#XmlTags
374          */
375
376         public void setXmlTags(boolean XmlTags) {
377                 configuration.XmlTags = XmlTags;
378         }
379
380         public boolean getXmlTags() {
381                 return configuration.XmlTags;
382         }
383
384         /**
385          * XmlOut - create output as XML
386          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#XmlOut
387          */
388
389         public void setXmlOut(boolean XmlOut) {
390                 configuration.XmlOut = XmlOut;
391         }
392
393         public boolean getXmlOut() {
394                 return configuration.XmlOut;
395         }
396
397         /**
398          * XHTML - output extensible HTML
399          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#xHTML
400          */
401
402         public void setXHTML(boolean xHTML) {
403                 configuration.xHTML = xHTML;
404         }
405
406         public boolean getXHTML() {
407                 return configuration.xHTML;
408         }
409
410         /**
411          * RawOut - avoid mapping values > 127 to entities
412          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#RawOut
413          */
414
415         public void setRawOut(boolean RawOut) {
416                 configuration.RawOut = RawOut;
417         }
418
419         public boolean getRawOut() {
420                 return configuration.RawOut;
421         }
422
423         /**
424          * UpperCaseTags - output tags in upper not lower case
425          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#UpperCaseTags
426          */
427
428         public void setUpperCaseTags(boolean UpperCaseTags) {
429                 configuration.UpperCaseTags = UpperCaseTags;
430         }
431
432         public boolean getUpperCaseTags() {
433                 return configuration.UpperCaseTags;
434         }
435
436         /**
437          * UpperCaseAttrs - output attributes in upper not lower case
438          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#UpperCaseAttrs
439          */
440
441         public void setUpperCaseAttrs(boolean UpperCaseAttrs) {
442                 configuration.UpperCaseAttrs = UpperCaseAttrs;
443         }
444
445         public boolean getUpperCaseAttrs() {
446                 return configuration.UpperCaseAttrs;
447         }
448
449         /**
450          * MakeClean - remove presentational clutter
451          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#MakeClean
452          */
453
454         public void setMakeClean(boolean MakeClean) {
455                 configuration.MakeClean = MakeClean;
456         }
457
458         public boolean getMakeClean() {
459                 return configuration.MakeClean;
460         }
461
462         /**
463          * BreakBeforeBR - o/p newline before &lt;br&gt; or not?
464          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#BreakBeforeBR
465          */
466
467         public void setBreakBeforeBR(boolean BreakBeforeBR) {
468                 configuration.BreakBeforeBR = BreakBeforeBR;
469         }
470
471         public boolean getBreakBeforeBR() {
472                 return configuration.BreakBeforeBR;
473         }
474
475         /**
476          * BurstSlides - create slides on each h2 element
477          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#BurstSlides
478          */
479
480         public void setBurstSlides(boolean BurstSlides) {
481                 configuration.BurstSlides = BurstSlides;
482         }
483
484         public boolean getBurstSlides() {
485                 return configuration.BurstSlides;
486         }
487
488         /**
489          * NumEntities - use numeric entities
490          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#NumEntities
491          */
492
493         public void setNumEntities(boolean NumEntities) {
494                 configuration.NumEntities = NumEntities;
495         }
496
497         public boolean getNumEntities() {
498                 return configuration.NumEntities;
499         }
500
501         /**
502          * QuoteMarks - output " marks as &amp;quot;
503          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#QuoteMarks
504          */
505
506         public void setQuoteMarks(boolean QuoteMarks) {
507                 configuration.QuoteMarks = QuoteMarks;
508         }
509
510         public boolean getQuoteMarks() {
511                 return configuration.QuoteMarks;
512         }
513
514         /**
515          * QuoteNbsp - output non-breaking space as entity
516          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#QuoteNbsp
517          */
518
519         public void setQuoteNbsp(boolean QuoteNbsp) {
520                 configuration.QuoteNbsp = QuoteNbsp;
521         }
522
523         public boolean getQuoteNbsp() {
524                 return configuration.QuoteNbsp;
525         }
526
527         /**
528          * QuoteAmpersand - output naked ampersand as &amp;
529          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#QuoteAmpersand
530          */
531
532         public void setQuoteAmpersand(boolean QuoteAmpersand) {
533                 configuration.QuoteAmpersand = QuoteAmpersand;
534         }
535
536         public boolean getQuoteAmpersand() {
537                 return configuration.QuoteAmpersand;
538         }
539
540         /**
541          * WrapAttVals - wrap within attribute values
542          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapAttVals
543          */
544
545         public void setWrapAttVals(boolean WrapAttVals) {
546                 configuration.WrapAttVals = WrapAttVals;
547         }
548
549         public boolean getWrapAttVals() {
550                 return configuration.WrapAttVals;
551         }
552
553         /**
554          * WrapScriptlets - wrap within JavaScript string literals
555          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapScriptlets
556          */
557
558         public void setWrapScriptlets(boolean WrapScriptlets) {
559                 configuration.WrapScriptlets = WrapScriptlets;
560         }
561
562         public boolean getWrapScriptlets() {
563                 return configuration.WrapScriptlets;
564         }
565
566         /**
567          * WrapSection - wrap within &lt;![ ... ]&gt; section tags
568          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapSection
569          */
570
571         public void setWrapSection(boolean WrapSection) {
572                 configuration.WrapSection = WrapSection;
573         }
574
575         public boolean getWrapSection() {
576                 return configuration.WrapSection;
577         }
578
579         /**
580          * AltText - default text for alt attribute
581          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#altText
582          */
583
584         public void setAltText(String altText) {
585                 configuration.altText = altText;
586         }
587
588         public String getAltText() {
589                 return configuration.altText;
590         }
591
592         /**
593          * Slidestyle - style sheet for slides
594          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#slidestyle
595          */
596
597         public void setSlidestyle(String slidestyle) {
598                 configuration.slidestyle = slidestyle;
599         }
600
601         public String getSlidestyle() {
602                 return configuration.slidestyle;
603         }
604
605         /**
606          * XmlPi - add &lt;?xml?&gt; for XML docs
607          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#XmlPi
608          */
609
610         public void setXmlPi(boolean XmlPi) {
611                 configuration.XmlPi = XmlPi;
612         }
613
614         public boolean getXmlPi() {
615                 return configuration.XmlPi;
616         }
617
618         /**
619          * DropFontTags - discard presentation tags
620          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#DropFontTags
621          */
622
623         public void setDropFontTags(boolean DropFontTags) {
624                 configuration.DropFontTags = DropFontTags;
625         }
626
627         public boolean getDropFontTags() {
628                 return configuration.DropFontTags;
629         }
630
631   //gschadow patch start
632   /**
633     * DropPseudoXMLCrap - remove all pseudo-XML tags, i.e. the
634     * markup embedded in documents by ASP, JSP, PHP and
635     * similar technologies.
636     *
637     * @see net.sourceforge.phpdt.tidy.w3c.Configuration#DropPseudoXMLCrap
638     */
639   public void setDropPseudoXMLCrap(boolean DropPseudoXMLCrap) {
640      configuration.DropPseudoXMLCrap = DropPseudoXMLCrap;
641   }
642   //gschadow patch end
643  
644   public boolean getDropPseudoXMLCrap() {
645        return configuration.DropPseudoXMLCrap;
646   }
647  
648         /**
649          * DropEmptyParas - discard empty p elements
650          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#DropEmptyParas
651          */
652
653         public void setDropEmptyParas(boolean DropEmptyParas) {
654                 configuration.DropEmptyParas = DropEmptyParas;
655         }
656
657         public boolean getDropEmptyParas() {
658                 return configuration.DropEmptyParas;
659         }
660
661         /**
662          * FixComments - fix comments with adjacent hyphens
663          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#FixComments
664          */
665
666         public void setFixComments(boolean FixComments) {
667                 configuration.FixComments = FixComments;
668         }
669
670         public boolean getFixComments() {
671                 return configuration.FixComments;
672         }
673
674         /**
675          * WrapAsp - wrap within ASP pseudo elements
676          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapAsp
677          */
678
679         public void setWrapAsp(boolean WrapAsp) {
680                 configuration.WrapAsp = WrapAsp;
681         }
682
683         public boolean getWrapAsp() {
684                 return configuration.WrapAsp;
685         }
686
687         /**
688          * WrapJste - wrap within JSTE pseudo elements
689          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapJste
690          */
691
692         public void setWrapJste(boolean WrapJste) {
693                 configuration.WrapJste = WrapJste;
694         }
695
696         public boolean getWrapJste() {
697                 return configuration.WrapJste;
698         }
699
700         /**
701          * WrapPhp - wrap within PHP pseudo elements
702          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapPhp
703          */
704
705         public void setWrapPhp(boolean WrapPhp) {
706                 configuration.WrapPhp = WrapPhp;
707         }
708
709         public boolean getWrapPhp() {
710                 return configuration.WrapPhp;
711         }
712
713         /**
714          * FixBackslash - fix URLs by replacing \ with /
715          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#FixBackslash
716          */
717
718         public void setFixBackslash(boolean FixBackslash) {
719                 configuration.FixBackslash = FixBackslash;
720         }
721
722         public boolean getFixBackslash() {
723                 return configuration.FixBackslash;
724         }
725
726         /**
727          * IndentAttributes - newline+indent before each attribute
728          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#IndentAttributes
729          */
730
731         public void setIndentAttributes(boolean IndentAttributes) {
732                 configuration.IndentAttributes = IndentAttributes;
733         }
734
735         public boolean getIndentAttributes() {
736                 return configuration.IndentAttributes;
737         }
738
739         /**
740          * DocType - user specified doctype
741          * omit | auto | strict | loose | <i>fpi</i>
742          * where the <i>fpi</i> is a string similar to
743          *    &quot;-//ACME//DTD HTML 3.14159//EN&quot;
744          * Note: for <i>fpi</i> include the double-quotes in the string.
745          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#docTypeStr
746          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#docTypeMode
747          */
748
749         public void setDocType(String doctype) {
750                 if (doctype != null)
751                         configuration.docTypeStr = configuration.parseDocType(doctype, "doctype");
752         }
753
754         public String getDocType() {
755                 String result = null;
756                 switch (configuration.docTypeMode) {
757                         case Configuration.DOCTYPE_OMIT :
758                                 result = "omit";
759                                 break;
760                         case Configuration.DOCTYPE_AUTO :
761                                 result = "auto";
762                                 break;
763                         case Configuration.DOCTYPE_STRICT :
764                                 result = "strict";
765                                 break;
766                         case Configuration.DOCTYPE_LOOSE :
767                                 result = "loose";
768                                 break;
769                         case Configuration.DOCTYPE_USER :
770                                 result = configuration.docTypeStr;
771                                 break;
772                 }
773                 return result;
774         }
775
776         /**
777          * LogicalEmphasis - replace i by em and b by strong
778          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#LogicalEmphasis
779          */
780
781         public void setLogicalEmphasis(boolean LogicalEmphasis) {
782                 configuration.LogicalEmphasis = LogicalEmphasis;
783         }
784
785         public boolean getLogicalEmphasis() {
786                 return configuration.LogicalEmphasis;
787         }
788
789         /**
790          * XmlPIs - if set to true PIs must end with ?>
791          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#XmlPIs
792          */
793
794         public void setXmlPIs(boolean XmlPIs) {
795                 configuration.XmlPIs = XmlPIs;
796         }
797
798         public boolean getXmlPIs() {
799                 return configuration.XmlPIs;
800         }
801
802         /**
803          * EncloseText - if true text at body is wrapped in &lt;p&gt;'s
804          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#EncloseBodyText
805          */
806
807         public void setEncloseText(boolean EncloseText) {
808                 configuration.EncloseBodyText = EncloseText;
809         }
810
811         public boolean getEncloseText() {
812                 return configuration.EncloseBodyText;
813         }
814
815         /**
816          * EncloseBlockText - if true text in blocks is wrapped in &lt;p&gt;'s
817          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#EncloseBlockText
818          */
819
820         public void setEncloseBlockText(boolean EncloseBlockText) {
821                 configuration.EncloseBlockText = EncloseBlockText;
822         }
823
824         public boolean getEncloseBlockText() {
825                 return configuration.EncloseBlockText;
826         }
827
828         /**
829          * KeepFileTimes - if true last modified time is preserved<br>
830          * <b>this is NOT supported at this time.</b>
831          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#KeepFileTimes
832          */
833
834         public void setKeepFileTimes(boolean KeepFileTimes) {
835                 configuration.KeepFileTimes = KeepFileTimes;
836         }
837
838         public boolean getKeepFileTimes() {
839                 return configuration.KeepFileTimes;
840         }
841
842         /**
843          * Word2000 - draconian cleaning for Word2000
844          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#Word2000
845          */
846
847         public void setWord2000(boolean Word2000) {
848                 configuration.Word2000 = Word2000;
849         }
850
851         public boolean getWord2000() {
852                 return configuration.Word2000;
853         }
854
855         /**
856          * TidyMark - add meta element indicating tidied doc
857          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#TidyMark
858          */
859
860         public void setTidyMark(boolean TidyMark) {
861                 configuration.TidyMark = TidyMark;
862         }
863
864         public boolean getTidyMark() {
865                 return configuration.TidyMark;
866         }
867
868         /**
869          * XmlSpace - if set to yes adds xml:space attr as needed
870          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#XmlSpace
871          */
872
873         public void setXmlSpace(boolean XmlSpace) {
874                 configuration.XmlSpace = XmlSpace;
875         }
876
877         public boolean getXmlSpace() {
878                 return configuration.XmlSpace;
879         }
880
881         /**
882          * Emacs - if true format error output for GNU Emacs
883          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#Emacs
884          */
885
886         public void setEmacs(boolean Emacs) {
887                 configuration.Emacs = Emacs;
888         }
889
890         public boolean getEmacs() {
891                 return configuration.Emacs;
892         }
893
894         /**
895          * LiteralAttribs - if true attributes may use newlines
896          * @see net.sourceforge.phpdt.tidy.w3c.Configuration#LiteralAttribs
897          */
898
899         public void setLiteralAttribs(boolean LiteralAttribs) {
900                 configuration.LiteralAttribs = LiteralAttribs;
901         }
902
903         public boolean getLiteralAttribs() {
904                 return configuration.LiteralAttribs;
905         }
906
907         /**
908          * InputStreamName - the name of the input stream (printed in the
909          * header information).
910          */
911         public void setInputStreamName(String name) {
912                 if (name != null)
913                         inputStreamName = name;
914         }
915
916         public String getInputStreamName() {
917                 return inputStreamName;
918         }
919
920         /**
921          * Sets the configuration from a configuration file.
922          */
923
924         public void setConfigurationFromFile(String filename) {
925                 configuration.parseFile(filename);
926         }
927
928         /**
929          * Sets the configuration from a properties object.
930          */
931
932         public void setConfigurationFromProps(Properties props) {
933                 configuration.addProps(props);
934         }
935
936         /**
937          * first time initialization which should
938          * precede reading the command line
939          */
940
941         private void init() {
942                 configuration = new Configuration();
943                 if (configuration == null)
944                         return;
945
946                 AttributeTable at = AttributeTable.getDefaultAttributeTable();
947                 if (at == null)
948                         return;
949                 TagTable tt = new TagTable();
950                 if (tt == null)
951                         return;
952                 tt.setConfiguration(configuration);
953                 configuration.tt = tt;
954                 EntityTable et = EntityTable.getDefaultEntityTable();
955                 if (et == null)
956                         return;
957
958                 /* Unnecessary - same initial values in Configuration
959                 Configuration.XmlTags       = false;
960                 Configuration.XmlOut        = false;
961                 Configuration.HideEndTags   = false;
962                 Configuration.UpperCaseTags = false;
963                 Configuration.MakeClean     = false;
964                 Configuration.writeback     = false;
965                 Configuration.OnlyErrors    = false;
966                 */
967
968                 configuration.errfile = null;
969                 stderr = new PrintWriter(System.err, true);
970                 errout = stderr;
971                 initialized = true;
972         }
973
974         /**
975          * Parses InputStream in and returns the root Node.
976          * If out is non-null, pretty prints to OutputStream out.
977          */
978
979         public Node parse(IFile iFile, InputStream in, OutputStream out) {
980                 Node document = null;
981
982                 try {
983                         document = parse(iFile, in, null, out);
984
985                 } catch (FileNotFoundException fnfe) {
986                 } catch (IOException e) {
987                 }
988
989                 return document;
990         }
991
992         /**
993          * Internal routine that actually does the parsing.  The caller
994          * can pass either an InputStream or file name.  If both are passed,
995          * the file name is preferred.
996          */
997
998         private Node parse(IFile iFile, InputStream in, String file, OutputStream out)
999                 throws FileNotFoundException, IOException {
1000                 Lexer lexer;
1001                 Node document = null;
1002                 Node doctype;
1003                 Out o = new OutImpl(); /* normal output stream */
1004                 PPrint pprint;
1005
1006                 if (!initialized)
1007                         return null;
1008
1009                 if (errout == null)
1010                         return null;
1011
1012                 parseErrors = 0;
1013                 parseWarnings = 0;
1014
1015                 /* ensure config is self-consistent */
1016                 configuration.adjust();
1017
1018                 if (file != null) {
1019                         in = new FileInputStream(file);
1020                         inputStreamName = file;
1021                 } else if (in == null) {
1022                         in = System.in;
1023                         inputStreamName = "stdin";
1024                 }
1025
1026                 if (in != null) {
1027                         lexer =
1028                                 new Lexer(
1029                                         iFile,
1030                                         new StreamInImpl(in, configuration.CharEncoding, configuration.tabsize),
1031                                         configuration);
1032                         lexer.errout = errout;
1033
1034                         /*
1035                           store pointer to lexer in input stream
1036                           to allow character encoding errors to be
1037                           reported
1038                         */
1039                         lexer.in.lexer = lexer;
1040
1041                         /* Tidy doesn't alter the doctype for generic XML docs */
1042                         if (configuration.XmlTags)
1043                                 document = ParserImpl.parseXMLDocument(lexer);
1044                         else {
1045                                 lexer.warnings = 0;
1046                                 if (!configuration.Quiet)
1047                                         Report.helloMessage(errout, Report.RELEASE_DATE, inputStreamName);
1048
1049                                 document = ParserImpl.parseDocument(lexer);
1050
1051                                 if (!document.checkNodeIntegrity()) {
1052                                         Report.badTree(errout);
1053                                         return null;
1054                                 }
1055
1056                                 Clean cleaner = new Clean(configuration.tt);
1057
1058                                 /* simplifies <b><b> ... </b> ...</b> etc. */
1059                                 cleaner.nestedEmphasis(document);
1060
1061                                 /* cleans up <dir>indented text</dir> etc. */
1062                                 cleaner.list2BQ(document);
1063                                 cleaner.bQ2Div(document);
1064
1065                                 /* replaces i by em and b by strong */
1066                                 if (configuration.LogicalEmphasis)
1067                                         cleaner.emFromI(document);
1068
1069                                 if (configuration.Word2000 && cleaner.isWord2000(document, configuration.tt)) {
1070                                         /* prune Word2000's <![if ...]> ... <![endif]> */
1071                                         cleaner.dropSections(lexer, document);
1072
1073                                         /* drop style & class attributes and empty p, span elements */
1074                                         cleaner.cleanWord2000(lexer, document);
1075                                 }
1076         
1077         //gschadow patch start
1078         if (configuration.DropPseudoXMLCrap) {
1079           cleaner.dropPseudoXMLCrap(lexer, document);
1080         }
1081         //gschadow patch end
1082          
1083                                 /* replaces presentational markup by style rules */
1084                                 if (configuration.MakeClean || configuration.DropFontTags)
1085                                         cleaner.cleanTree(lexer, document);
1086
1087                                 if (!document.checkNodeIntegrity()) {
1088                                         Report.badTree(errout);
1089                                         return null;
1090                                 }
1091                                 doctype = document.findDocType();
1092                                 if (document.content != null) {
1093                                         if (configuration.xHTML)
1094                                                 lexer.setXHTMLDocType(document);
1095                                         else
1096                                                 lexer.fixDocType(document);
1097
1098                                         if (configuration.TidyMark)
1099                                                 lexer.addGenerator(document);
1100                                 }
1101
1102                                 /* ensure presence of initial <?XML version="1.0"?> */
1103                                 if (configuration.XmlOut && configuration.XmlPi)
1104                                         lexer.fixXMLPI(document);
1105
1106                                 if (!configuration.Quiet && document.content != null) {
1107                                         Report.reportVersion(errout, lexer, inputStreamName, doctype);
1108                                         Report.reportNumWarnings(errout, lexer);
1109                                 }
1110                         }
1111
1112                         parseWarnings = lexer.warnings;
1113                         parseErrors = lexer.errors;
1114
1115                         // Try to close the InputStream but only if if we created it.
1116
1117                         if ((file != null) && (in != System.in)) {
1118                                 try {
1119                                         in.close();
1120                                 } catch (IOException e) {
1121                                 }
1122                         }
1123
1124                         if (lexer.errors > 0)
1125                                 Report.needsAuthorIntervention(errout);
1126
1127                         o.state = StreamIn.FSM_ASCII;
1128                         o.encoding = configuration.CharEncoding;
1129
1130                         if (!configuration.OnlyErrors && lexer.errors == 0) {
1131                                 if (configuration.BurstSlides) {
1132                                         Node body;
1133
1134                                         body = null;
1135                                         /*
1136                                            remove doctype to avoid potential clash with
1137                                            markup introduced when bursting into slides
1138                                         */
1139                                         /* discard the document type */
1140                                         doctype = document.findDocType();
1141
1142                                         if (doctype != null)
1143                                                 Node.discardElement(doctype);
1144
1145                                         /* slides use transitional features */
1146                                         lexer.versions |= Dict.VERS_HTML40_LOOSE;
1147
1148                                         /* and patch up doctype to match */
1149                                         if (configuration.xHTML)
1150                                                 lexer.setXHTMLDocType(document);
1151                                         else
1152                                                 lexer.fixDocType(document);
1153
1154                                         /* find the body element which may be implicit */
1155                                         body = document.findBody(configuration.tt);
1156
1157                                         if (body != null) {
1158                                                 pprint = new PPrint(configuration);
1159                                                 Report.reportNumberOfSlides(errout, pprint.countSlides(body));
1160                                                 pprint.createSlides(lexer, document);
1161                                         } else
1162                                                 Report.missingBody(errout);
1163                                 } else if (configuration.writeback && (file != null)) {
1164                                         try {
1165                                                 pprint = new PPrint(configuration);
1166                                                 o.out = new FileOutputStream(file);
1167
1168                                                 if (configuration.XmlTags)
1169                                                         pprint.printXMLTree(o, (short) 0, 0, lexer, document);
1170                                                 else
1171                                                         pprint.printTree(o, (short) 0, 0, lexer, document);
1172
1173                                                 pprint.flushLine(o, 0);
1174                                                 o.out.close();
1175                                         } catch (IOException e) {
1176                                                 errout.println(file + e.toString());
1177                                         }
1178                                 } else if (out != null) {
1179                                         pprint = new PPrint(configuration);
1180                                         o.out = out;
1181
1182                                         if (configuration.XmlTags)
1183                                                 pprint.printXMLTree(o, (short) 0, 0, lexer, document);
1184                                         else
1185                                                 pprint.printTree(o, (short) 0, 0, lexer, document);
1186
1187                                         pprint.flushLine(o, 0);
1188                                 }
1189
1190                         }
1191
1192                         Report.errorSummary(lexer);
1193                 }
1194                 return document;
1195         }
1196
1197         /**
1198          * Parses InputStream in and returns a DOM Document node.
1199          * If out is non-null, pretty prints to OutputStream out.
1200          */
1201
1202         public org.w3c.dom.Document parseDOM(IFile file, InputStream in, OutputStream out) {
1203                 Node document = parse(file, in, out);
1204                 if (document != null)
1205                         return (org.w3c.dom.Document) document.getAdapter();
1206                 else
1207                         return null;
1208         }
1209
1210         /**
1211          * Creates an empty DOM Document.
1212          */
1213
1214         public static org.w3c.dom.Document createEmptyDocument() {
1215                 Node document = new Node(Node.RootNode, new byte[0], 0, 0);
1216                 Node node = new Node(Node.StartTag, new byte[0], 0, 0, "html", new TagTable());
1217                 if (document != null && node != null) {
1218                         Node.insertNodeAtStart(document, node);
1219                         return (org.w3c.dom.Document) document.getAdapter();
1220                 } else {
1221                         return null;
1222                 }
1223         }
1224
1225         /**
1226          * Pretty-prints a DOM Document.
1227          */
1228
1229         public void pprint(org.w3c.dom.Document doc, OutputStream out) {
1230                 Out o = new OutImpl();
1231                 PPrint pprint;
1232                 Node document;
1233
1234                 if (!(doc instanceof DOMDocumentImpl)) {
1235                         return;
1236                 }
1237                 document = ((DOMDocumentImpl) doc).adaptee;
1238
1239                 o.state = StreamIn.FSM_ASCII;
1240                 o.encoding = configuration.CharEncoding;
1241
1242                 if (out != null) {
1243                         pprint = new PPrint(configuration);
1244                         o.out = out;
1245
1246                         if (configuration.XmlTags)
1247                                 pprint.printXMLTree(o, (short) 0, 0, null, document);
1248                         else
1249                                 pprint.printTree(o, (short) 0, 0, null, document);
1250
1251                         pprint.flushLine(o, 0);
1252                 }
1253         }
1254
1255         /**
1256          * Command line interface to parser and pretty printer.
1257          */
1258
1259         public static void main(String[] argv) {
1260                 int totalerrors = 0;
1261                 int totalwarnings = 0;
1262                 String file;
1263                 InputStream in;
1264                 String prog = "Tidy";
1265                 Node document;
1266                 Node doctype;
1267                 Lexer lexer;
1268                 String s;
1269                 Out out = new OutImpl(); /* normal output stream */
1270                 PPrint pprint;
1271                 int argc = argv.length + 1;
1272                 int argIndex = 0;
1273                 Tidy tidy;
1274                 Configuration configuration;
1275                 String arg;
1276                 String current_errorfile = "stderr";
1277
1278                 tidy = new Tidy();
1279                 configuration = tidy.getConfiguration();
1280
1281                 /* read command line */
1282
1283                 while (argc > 0) {
1284                         if (argc > 1 && argv[argIndex].startsWith("-")) {
1285                                 /* support -foo and --foo */
1286                                 arg = argv[argIndex].substring(1);
1287
1288                                 if (arg.length() > 0 && arg.charAt(0) == '-')
1289                                         arg = arg.substring(1);
1290
1291                                 if (arg.equals("xml"))
1292                                         configuration.XmlTags = true;
1293                                 else if (arg.equals("asxml") || arg.equals("asxhtml"))
1294                                         configuration.xHTML = true;
1295                                 else if (arg.equals("indent")) {
1296                                         configuration.IndentContent = true;
1297                                         configuration.SmartIndent = true;
1298                                 } else if (arg.equals("omit"))
1299                                         configuration.HideEndTags = true;
1300                                 else if (arg.equals("upper"))
1301                                         configuration.UpperCaseTags = true;
1302                                 else if (arg.equals("clean"))
1303                                         configuration.MakeClean = true;
1304                                 else if (arg.equals("raw"))
1305                                         configuration.CharEncoding = Configuration.RAW;
1306                                 else if (arg.equals("ascii"))
1307                                         configuration.CharEncoding = Configuration.ASCII;
1308                                 else if (arg.equals("latin1"))
1309                                         configuration.CharEncoding = Configuration.LATIN1;
1310                                 else if (arg.equals("utf8"))
1311                                         configuration.CharEncoding = Configuration.UTF8;
1312                                 else if (arg.equals("iso2022"))
1313                                         configuration.CharEncoding = Configuration.ISO2022;
1314                                 else if (arg.equals("mac"))
1315                                         configuration.CharEncoding = Configuration.MACROMAN;
1316                                 else if (arg.equals("numeric"))
1317                                         configuration.NumEntities = true;
1318                                 else if (arg.equals("modify"))
1319                                         configuration.writeback = true;
1320                                 else if (arg.equals("change")) /* obsolete */
1321                                         configuration.writeback = true;
1322                                 else if (arg.equals("update")) /* obsolete */
1323                                         configuration.writeback = true;
1324                                 else if (arg.equals("errors"))
1325                                         configuration.OnlyErrors = true;
1326                                 else if (arg.equals("quiet"))
1327                                         configuration.Quiet = true;
1328                                 else if (arg.equals("slides"))
1329                                         configuration.BurstSlides = true;
1330                                 else if (arg.equals("help") || argv[argIndex].charAt(1) == '?' || argv[argIndex].charAt(1) == 'h') {
1331                                         Report.helpText(new PrintWriter(System.out, true), prog);
1332                                         System.exit(1);
1333                                 } else if (arg.equals("config")) {
1334                                         if (argc >= 3) {
1335                                                 configuration.parseFile(argv[argIndex + 1]);
1336                                                 --argc;
1337                                                 ++argIndex;
1338                                         }
1339                                 } else if (
1340                                         argv[argIndex].equals("-file") || argv[argIndex].equals("--file") || argv[argIndex].equals("-f")) {
1341                                         if (argc >= 3) {
1342                                                 configuration.errfile = argv[argIndex + 1];
1343                                                 --argc;
1344                                                 ++argIndex;
1345                                         }
1346                                 } else if (
1347                                         argv[argIndex].equals("-wrap") || argv[argIndex].equals("--wrap") || argv[argIndex].equals("-w")) {
1348                                         if (argc >= 3) {
1349                                                 configuration.wraplen = Integer.parseInt(argv[argIndex + 1]);
1350                                                 --argc;
1351                                                 ++argIndex;
1352                                         }
1353                                 } else if (
1354                                         argv[argIndex].equals("-version")
1355                                                 || argv[argIndex].equals("--version")
1356                                                 || argv[argIndex].equals("-v")) {
1357                                         Report.showVersion(tidy.getErrout());
1358                                         System.exit(0);
1359                                 } else {
1360                                         s = argv[argIndex];
1361
1362                                         for (int i = 1; i < s.length(); i++) {
1363                                                 if (s.charAt(i) == 'i') {
1364                                                         configuration.IndentContent = true;
1365                                                         configuration.SmartIndent = true;
1366                                                 } else if (s.charAt(i) == 'o')
1367                                                         configuration.HideEndTags = true;
1368                                                 else if (s.charAt(i) == 'u')
1369                                                         configuration.UpperCaseTags = true;
1370                                                 else if (s.charAt(i) == 'c')
1371                                                         configuration.MakeClean = true;
1372                                                 else if (s.charAt(i) == 'n')
1373                                                         configuration.NumEntities = true;
1374                                                 else if (s.charAt(i) == 'm')
1375                                                         configuration.writeback = true;
1376                                                 else if (s.charAt(i) == 'e')
1377                                                         configuration.OnlyErrors = true;
1378                                                 else if (s.charAt(i) == 'q')
1379                                                         configuration.Quiet = true;
1380                                                 else
1381                                                         Report.unknownOption(tidy.getErrout(), s.charAt(i));
1382                                         }
1383                                 }
1384
1385                                 --argc;
1386                                 ++argIndex;
1387                                 continue;
1388                         }
1389
1390                         /* ensure config is self-consistent */
1391                         configuration.adjust();
1392
1393                         /* user specified error file */
1394                         if (configuration.errfile != null) {
1395                                 /* is it same as the currently opened file? */
1396                                 if (!configuration.errfile.equals(current_errorfile)) {
1397                                         /* no so close previous error file */
1398
1399                                         if (tidy.getErrout() != tidy.getStderr())
1400                                                 tidy.getErrout().close();
1401
1402                                         /* and try to open the new error file */
1403                                         try {
1404                                                 tidy.setErrout(new PrintWriter(new FileWriter(configuration.errfile), true));
1405                                                 current_errorfile = configuration.errfile;
1406                                         } catch (IOException e) {
1407                                                 /* can't be opened so fall back to stderr */
1408                                                 current_errorfile = "stderr";
1409                                                 tidy.setErrout(tidy.getStderr());
1410                                         }
1411                                 }
1412                         }
1413
1414                         if (argc > 1) {
1415                                 file = argv[argIndex];
1416                         } else {
1417                                 file = "stdin";
1418                         }
1419
1420                         try {
1421                                 document = tidy.parse(null, null, file, System.out);
1422                                 totalwarnings += tidy.parseWarnings;
1423                                 totalerrors += tidy.parseErrors;
1424                         } catch (FileNotFoundException fnfe) {
1425                                 Report.unknownFile(tidy.getErrout(), prog, file);
1426                         } catch (IOException ioe) {
1427                                 Report.unknownFile(tidy.getErrout(), prog, file);
1428                         }
1429
1430                         --argc;
1431                         ++argIndex;
1432
1433                         if (argc <= 1)
1434                                 break;
1435                 }
1436
1437                 if (totalerrors + totalwarnings > 0)
1438                         Report.generalInfo(tidy.getErrout());
1439
1440                 if (tidy.getErrout() != tidy.getStderr())
1441                         tidy.getErrout().close();
1442
1443                 /* return status can be used by scripts */
1444
1445                 if (totalerrors > 0)
1446                         System.exit(2);
1447
1448                 if (totalwarnings > 0)
1449                         System.exit(1);
1450
1451                 /* 0 signifies all is ok */
1452                 System.exit(0);
1453         }
1454 }