Fix for #1380415 (toshihiro)
[phpeclipse.git] / net.sourceforge.phpeclipse / src / net / sourceforge / phpdt / internal / ui / text / HTML2TextReader.java
1 package net.sourceforge.phpdt.internal.ui.text;
2
3 /*
4  * (c) Copyright IBM Corp. 2000, 2001.
5  * All Rights Reserved.
6  */
7
8 import java.io.IOException;
9 import java.io.PushbackReader;
10 import java.io.Reader;
11 import java.util.HashMap;
12 import java.util.HashSet;
13 import java.util.Map;
14 import java.util.Set;
15
16 import net.sourceforge.phpdt.internal.ui.PHPUIMessages;
17
18 import org.eclipse.jface.text.TextPresentation;
19 import org.eclipse.swt.SWT;
20 import org.eclipse.swt.custom.StyleRange;
21
22 /**
23  * Reads the text contents from a reader of HTML contents and translates the
24  * tags or cut them out.
25  */
26 public class HTML2TextReader extends SubstitutionTextReader {
27
28         private static final String LINE_DELIM = System.getProperty(
29                         "line.separator", "\n"); //$NON-NLS-1$ //$NON-NLS-2$
30
31         private static final String EMPTY_STRING = ""; //$NON-NLS-1$
32
33         private static final Map fgEntityLookup;
34
35         private static final Set fgTags;
36
37         static {
38
39                 fgTags = new HashSet();
40                 fgTags.add("b"); //$NON-NLS-1$
41                 fgTags.add("br"); //$NON-NLS-1$
42                 fgTags.add("h5"); //$NON-NLS-1$
43                 fgTags.add("p"); //$NON-NLS-1$
44                 fgTags.add("dl"); //$NON-NLS-1$
45                 fgTags.add("dt"); //$NON-NLS-1$
46                 fgTags.add("dd"); //$NON-NLS-1$
47                 fgTags.add("li"); //$NON-NLS-1$
48                 fgTags.add("ul"); //$NON-NLS-1$
49                 fgTags.add("pre"); //$NON-NLS-1$
50
51                 fgEntityLookup = new HashMap(7);
52                 fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
53                 fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
54                 fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
55                 fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
56                 fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
57                 fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
58                 fgEntityLookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$
59         }
60
61         private int fCounter = 0;
62
63         private TextPresentation fTextPresentation;
64
65         private int fBold = 0;
66
67         private int fStartOffset = -1;
68
69         private boolean fInParagraph = false;
70
71         private boolean fIsPreformattedText = false;
72
73         /**
74          * Transforms the html text from the reader to formatted text.
75          * 
76          * @param presentation
77          *            If not <code>null</code>, formattings will be applied to
78          *            the presentation.
79          */
80         public HTML2TextReader(Reader reader, TextPresentation presentation) {
81                 super(new PushbackReader(reader));
82                 fTextPresentation = presentation;
83         }
84
85         public int read() throws IOException {
86                 int c = super.read();
87                 if (c != -1)
88                         ++fCounter;
89                 return c;
90         }
91
92         protected void startBold() {
93                 if (fBold == 0)
94                         fStartOffset = fCounter;
95                 ++fBold;
96         }
97
98         protected void startPreformattedText() {
99                 fIsPreformattedText = true;
100                 setSkipWhitespace(false);
101         }
102
103         protected void stopPreformattedText() {
104                 fIsPreformattedText = false;
105                 setSkipWhitespace(true);
106         }
107
108         protected void stopBold() {
109                 --fBold;
110                 if (fBold == 0) {
111                         if (fTextPresentation != null) {
112                                 fTextPresentation.addStyleRange(new StyleRange(fStartOffset,
113                                                 fCounter - fStartOffset, null, null, SWT.BOLD));
114                         }
115                         fStartOffset = -1;
116                 }
117         }
118
119         /**
120          * @see SubstitutionTextReader#computeSubstitution(char)
121          */
122         protected String computeSubstitution(int c) throws IOException {
123
124                 if (c == '<')
125                         return processHTMLTag();
126                 else if (c == '&')
127                         return processEntity();
128                 else if (fIsPreformattedText)
129                         return processPreformattedText(c);
130
131                 return null;
132         }
133
134         private String html2Text(String html) {
135
136                 String tag = html;
137                 if ('/' == tag.charAt(0))
138                         tag = tag.substring(1);
139
140                 if (!fgTags.contains(tag))
141                         return EMPTY_STRING;
142
143                 if ("pre".equals(html)) { //$NON-NLS-1$
144                         startPreformattedText();
145                         return EMPTY_STRING;
146                 }
147
148                 if ("/pre".equals(html)) { //$NON-NLS-1$
149                         stopPreformattedText();
150                         return EMPTY_STRING;
151                 }
152
153                 if (fIsPreformattedText)
154                         return EMPTY_STRING;
155
156                 if ("b".equals(html)) { //$NON-NLS-1$
157                         startBold();
158                         return EMPTY_STRING;
159                 }
160
161                 if ("h5".equals(html) || "dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
162                         startBold();
163                         return EMPTY_STRING;
164                 }
165
166                 if ("dl".equals(html)) //$NON-NLS-1$
167                         return LINE_DELIM;
168
169                 if ("dd".equals(html)) //$NON-NLS-1$
170                         return "\t"; //$NON-NLS-1$
171
172                 if ("li".equals(html)) //$NON-NLS-1$
173                         return LINE_DELIM
174                                         + "\t" + PHPUIMessages.getString("HTML2TextReader.dash"); //$NON-NLS-1$ //$NON-NLS-2$
175
176                 if ("/b".equals(html)) { //$NON-NLS-1$
177                         stopBold();
178                         return EMPTY_STRING;
179                 }
180
181                 if ("p".equals(html)) { //$NON-NLS-1$
182                         fInParagraph = true;
183                         return LINE_DELIM;
184                 }
185
186                 if ("br".equals(html)) //$NON-NLS-1$
187                         return LINE_DELIM;
188
189                 if ("/p".equals(html)) { //$NON-NLS-1$
190                         boolean inParagraph = fInParagraph;
191                         fInParagraph = false;
192                         return inParagraph ? EMPTY_STRING : LINE_DELIM;
193                 }
194
195                 if ("/h5".equals(html) || "/dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
196                         stopBold();
197                         return LINE_DELIM;
198                 }
199
200                 if ("/dd".equals(html)) //$NON-NLS-1$
201                         return LINE_DELIM;
202
203                 return EMPTY_STRING;
204         }
205
206         /*
207          * A '<' has been read. Process a html tag
208          */
209         private String processHTMLTag() throws IOException {
210
211                 StringBuffer buf = new StringBuffer();
212                 int ch;
213                 do {
214
215                         ch = nextChar();
216
217                         while (ch != -1 && ch != '>') {
218                                 buf.append(Character.toLowerCase((char) ch));
219                                 ch = nextChar();
220                                 if (ch == '"') {
221                                         buf.append(Character.toLowerCase((char) ch));
222                                         ch = nextChar();
223                                         while (ch != -1 && ch != '"') {
224                                                 buf.append(Character.toLowerCase((char) ch));
225                                                 ch = nextChar();
226                                         }
227                                 }
228                                 if (ch == '<') {
229                                         unread(ch);
230                                         return '<' + buf.toString();
231                                 }
232                         }
233
234                         if (ch == -1)
235                                 return null;
236
237                         int tagLen = buf.length();
238                         // needs special treatment for comments
239                         if ((tagLen >= 3 && "!--".equals(buf.substring(0, 3))) //$NON-NLS-1$
240                                         && !(tagLen >= 5 && "--!".equals(buf.substring(tagLen - 3)))) { //$NON-NLS-1$
241                                 // unfinished comment
242                                 buf.append(ch);
243                         } else {
244                                 break;
245                         }
246                 } while (true);
247
248                 return html2Text(buf.toString());
249         }
250
251         private String processPreformattedText(int c) {
252                 if (c == '\r' || c == '\n')
253                         fCounter++;
254                 return null;
255         }
256
257         private void unread(int ch) throws IOException {
258                 ((PushbackReader) getReader()).unread(ch);
259         }
260
261         protected String entity2Text(String symbol) {
262                 if (symbol.length() > 1 && symbol.charAt(0) == '#') {
263                         int ch;
264                         try {
265                                 if (symbol.charAt(1) == 'x') {
266                                         ch = Integer.parseInt(symbol.substring(2), 16);
267                                 } else {
268                                         ch = Integer.parseInt(symbol.substring(1), 10);
269                                 }
270                                 return EMPTY_STRING + (char) ch;
271                         } catch (NumberFormatException e) {
272                         }
273                 } else {
274                         String str = (String) fgEntityLookup.get(symbol);
275                         if (str != null) {
276                                 return str;
277                         }
278                 }
279                 return "&" + symbol; // not found //$NON-NLS-1$
280         }
281
282         /*
283          * A '&' has been read. Process a entity
284          */
285         private String processEntity() throws IOException {
286                 StringBuffer buf = new StringBuffer();
287                 int ch = nextChar();
288                 while (Character.isLetterOrDigit((char) ch) || ch == '#') {
289                         buf.append((char) ch);
290                         ch = nextChar();
291                 }
292
293                 if (ch == ';')
294                         return entity2Text(buf.toString());
295
296                 buf.insert(0, '&');
297                 if (ch != -1)
298                         buf.append((char) ch);
299                 return buf.toString();
300         }
301 }