2 * @(#)Clean.java 1.11 2000/08/16
10 * Clean up misuse of presentation markup
12 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
13 * See Tidy.java for the copyright notice.
14 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
15 * HTML Tidy Release 4 Aug 2000</a>
17 * @author Dave Raggett <dsr@w3.org>
18 * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
19 * @version 1.0, 1999/05/22
20 * @version 1.0.1, 1999/05/29
21 * @version 1.1, 1999/06/18 Java Bean
22 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
23 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
24 * @version 1.4, 1999/09/04 DOM support
25 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
26 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
27 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
28 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
29 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
30 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
31 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
35 Filters from other formats such as Microsoft Word
36 often make excessive use of presentation markup such
37 as font tags, B, I, and the align attribute. By applying
38 a set of production rules, it is straightforward to
39 transform this to use CSS.
41 Some rules replace some of the children of an element by
42 style properties on the element, e.g.
44 <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
46 Such rules are applied to the element's content and then
47 to the element itself until no more of the rules apply.
48 Having applied all the rules to an element, it will have
49 a style attribute with one or more properties.
51 Other rules strip the element they apply to, replacing
52 it by style properties on the contents, e.g.
54 <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
56 These rules are applied to an element before processing
57 its content and replace the current element by the first
58 element in the exposed content.
60 After applying both sets of rules, you can replace the
61 style attribute by a class value and style rule in the
62 document head. To support this, an association of styles
63 and class names is built.
65 A naive approach is to rely on string matching to test
66 when two property lists are the same. A better approach
67 would be to first sort the properties before matching.
/* Sequence number used when building generated class names (see gensymClass). */
72 private int classNum = 1;
/*
  Constructor; receives the tag table.
  NOTE(review): the body is not visible here - presumably it stores tt in
  the field that the rest of this class reads as tt.tagXxx; confirm.
*/
76 public Clean(TagTable tt)
/*
  Inserts the property name/value into the linked list props.
  Existing entries are compared with the new name using String.compareTo.
  A new node is created either in front of the first entry whose name is
  greater (the three-argument StyleProp constructor receives that entry)
  or with the two-argument constructor. Per the inline comment, a name
  that is already present keeps its existing value.
  NOTE(review): the loop header and the return statement are not visible
  here - presumably the head of the updated list is returned; confirm.
*/
81 private StyleProp insertProperty(StyleProp props, String name,
84 StyleProp first, prev, prop;
/* negative, zero or positive as props.name sorts before, equal to or after name */
92 cmp = props.name.compareTo(name);
96 /* this property is already defined, ignore new value */
100 if (cmp > 0) // props.name > name
102 /* insert before this */
104 prop = new StyleProp(name, value, props);
/* presumably reached when no existing entry sorts after name - TODO confirm */
118 prop = new StyleProp(name, value);
129 Create sorted linked list of properties from style string
130 The original C code temporarily placed nulls in place of ':' and ';' to
131 delimit the strings for the property name and value, and made a copy
132 first because some systems don't allow you to null literal strings.
133 This Java version extracts the name and value with substring() instead.
/*
  Parses the declarations in the string style and passes each name/value
  pair to insertProperty, starting from the existing list prop.
  Spaces in front of a name and in front of a value are skipped; a name
  ends at a colon and a value ends at a semicolon.
  NOTE(review): the return statement is not visible here - presumably the
  updated list is returned; confirm.
*/
135 private StyleProp createProps(StyleProp prop, String style)
144 while (name_start < style.length())
/* skip spaces in front of the property name */
146 while (name_start < style.length() &&
147 style.charAt(name_start) == ' ')
150 name_end = name_start;
/* scan forward for the colon that terminates the name */
152 while (name_end < style.length())
154 if (style.charAt(name_end) == ':')
156 value_start = name_end + 1;
/* holds when the scan ran off the end or did not stop on a colon */
163 if (name_end >= style.length() || style.charAt(name_end) != ':')
/* skip spaces in front of the value */
166 while (value_start < style.length() &&
167 style.charAt(value_start) == ' ')
170 value_end = value_start;
/* scan forward for the semicolon that terminates the value */
173 while (value_end < style.length())
175 if (style.charAt(value_end) == ';')
/* name is [name_start, name_end), value is [value_start, value_end) */
184 prop = insertProperty(prop,
185 style.substring(name_start, name_end),
186 style.substring(value_start, value_end));
/* resume scanning at the character after the value terminator */
190 name_start = value_end + 1;
/*
  Serializes the property list props into one string. For each entry the
  name, a colon plus space, and the value are appended; a semicolon plus
  space is appended as separator between entries.
  NOTE(review): the initial value of style and the return statement are
  not visible here; confirm.
*/
200 private String createPropString(StyleProp props)
/* first pass: total of name and value lengths plus 2 for each */
208 for (len = 0, prop = props; prop != null; prop = prop.next)
210 len += prop.name.length() + 2;
211 len += prop.value.length() + 2;
/* second pass: build the text by repeated concat */
214 for (prop = props; prop != null; prop = prop.next)
216 style = style.concat(prop.name);
217 style = style.concat(": ");
219 style = style.concat(prop.value);
/* last entry: the action is not visible here - presumably the loop is left before the separator is added */
221 if (prop.next == null)
224 style = style.concat("; ");
231 create string with merged properties
/*
  Merges the declaration in property into the declarations in style.
  Both strings are parsed into one property list by createProps (style
  first, then property) and the list is turned back into text by
  createPropString.
  NOTE(review): the return statement is not visible here - presumably the
  rebuilt style string is returned; confirm.
*/
233 private String addProperty(String style, String property)
237 prop = createProps(null, style);
238 prop = createProps(prop, property);
239 style = createPropString(prop);
/*
  Builds a generated class name: the letter c followed by classNum.
  NOTE(review): the tag argument is not used in the visible lines, and
  the update of classNum and the return statement are not visible here;
  confirm.
*/
243 private String gensymClass(String tag)
247 str = "c" + classNum;
/*
  Returns the class name registered for the given tag and property text.
  lexer.styles is searched for an entry whose tag and properties both
  equal the arguments. If there is none, a new Style with a class name
  from gensymClass is created and becomes the new head of lexer.styles.
*/
252 private String findStyle(Lexer lexer, String tag, String properties)
/* linear search; tag and property text are compared with equals */
256 for (style = lexer.styles; style != null; style=style.next)
258 if (style.tag.equals(tag) &&
259 style.properties.equals(properties))
260 return style.tagClass;
/* not found: the old head is passed to the constructor (presumably as the next link) */
263 style = new Style(tag, gensymClass(tag), properties, lexer.styles);
264 lexer.styles = style;
265 return style.tagClass;
269 Find style attribute in node, and replace it
270 by corresponding class attribute. Search for
271 class in style dictionary otherwise gensym
272 new class and add to dictionary.
274 If node already has a class attribute, the class name is appended to it
/*
  Replaces the style attribute of node by a class reference.
  The class name comes from findStyle, keyed by the element name and the
  style text. When node already has a class attribute the new name is
  appended to it and the style attribute is removed; otherwise the style
  attribute object itself is renamed to class and given the class name.
  Nodes without a style attribute are left alone.
*/
276 private void style2Rule(Lexer lexer, Node node)
278 AttVal styleattr, classattr;
281 styleattr = node.getAttrByName("style");
283 if (styleattr != null)
285 classname = findStyle(lexer, node.element, styleattr.value);
286 classattr = node.getAttrByName("class");
289 if there already is a class attribute
290 then append class name after a space
292 if (classattr != null)
294 classattr.value = classattr.value + " " + classname;
295 node.removeAttribute(styleattr);
297 else /* reuse style attribute for class attribute */
299 styleattr.attribute = "class";
300 styleattr.value = classname;
/*
  Appends one CSS rule to the lexer buffer via addStringLiteral: the
  selector, then an opening brace with the color property name, then the
  color value, then the closing brace and a newline.
*/
305 private void addColorRule(Lexer lexer, String selector, String color)
309 lexer.addStringLiteral(selector);
310 lexer.addStringLiteral(" { color: ");
311 lexer.addStringLiteral(color);
312 lexer.addStringLiteral(" }\n");
317 move presentation attribs from body to style element
319 background="foo" -> body { background-image: url(foo) }
320 bgcolor="foo" -> body { background-color: foo }
321 text="foo" -> body { color: foo }
322 link="foo" -> :link { color: foo }
323 vlink="foo" -> :visited { color: foo }
324 alink="foo" -> :active { color: foo }
/*
  Moves presentational attributes off the body element and writes the
  equivalent CSS into the lexer buffer.
  background, bgcolor and text are looked up and removed first; if any of
  them supplied a value a single body rule is written. link, vlink and
  alink are then each turned into their own rule through addColorRule
  and removed.
  NOTE(review): the null checks around each lookup and the assignments to
  bgurl and color are not visible here; confirm.
*/
326 private void cleanBodyAttrs(Lexer lexer, Node body)
330 String bgcolor = null;
333 attr = body.getAttrByName("background");
339 body.removeAttribute(attr);
342 attr = body.getAttrByName("bgcolor");
346 bgcolor = attr.value;
348 body.removeAttribute(attr);
351 attr = body.getAttrByName("text");
357 body.removeAttribute(attr);
/* emit the body rule only when at least one of the three values was found */
360 if (bgurl != null || bgcolor != null || color != null)
362 lexer.addStringLiteral(" body {\n");
366 lexer.addStringLiteral(" background-image: url(");
367 lexer.addStringLiteral(bgurl);
368 lexer.addStringLiteral(");\n");
373 lexer.addStringLiteral(" background-color: ");
374 lexer.addStringLiteral(bgcolor);
375 lexer.addStringLiteral(";\n");
380 lexer.addStringLiteral(" color: ");
381 lexer.addStringLiteral(color);
382 lexer.addStringLiteral(";\n");
385 lexer.addStringLiteral(" }\n");
/* link colors become separate pseudo-class rules */
388 attr = body.getAttrByName("link");
392 addColorRule(lexer, " :link", attr.value);
393 body.removeAttribute(attr);
396 attr = body.getAttrByName("vlink");
400 addColorRule(lexer, " :visited", attr.value);
401 body.removeAttribute(attr);
404 attr = body.getAttrByName("alink");
408 addColorRule(lexer, " :active", attr.value);
409 body.removeAttribute(attr);
/*
  Looks up the body element of doc and tests whether it carries any of
  the attributes background, bgcolor, text, link, vlink or alink. In the
  visible branch Report.USING_BODY is OR-ed into lexer.badLayout.
  NOTE(review): the null check on body and the returned values are not
  visible here; confirm which case yields true.
*/
413 private boolean niceBody(Lexer lexer, Node doc)
415 Node body = doc.findBody(lexer.configuration.tt);
420 body.getAttrByName("background") != null ||
421 body.getAttrByName("bgcolor") != null ||
422 body.getAttrByName("text") != null ||
423 body.getAttrByName("link") != null ||
424 body.getAttrByName("vlink") != null ||
425 body.getAttrByName("alink") != null
428 lexer.badLayout |= Report.USING_BODY;
436 /* create style element using rules from dictionary */
/*
  Builds an implicit style element with a type attribute of text/css and
  appends it to the node returned by doc.findHEAD.
  Its text is generated into the lexer buffer: first the rules produced
  by cleanBodyAttrs, then one rule per entry of lexer.styles written as
  tag, a dot, the class name and the properties in braces.
  NOTE(review): the action under the first if (no styles and niceBody
  holds) is not visible here - presumably an early return; confirm. Null
  checks on body and head are likewise not visible.
*/
437 private void createStyleElement(Lexer lexer, Node doc)
439 Node node, head, body;
443 if (lexer.styles == null && niceBody(lexer, doc))
446 node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
447 node.implicit = true;
449 /* insert type attribute */
450 av = new AttVal(null, null, '"', "type", "text/css");
451 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
452 node.attributes = av;
454 body = doc.findBody(lexer.configuration.tt);
/* remember where the generated rule text starts in the lexer buffer */
456 lexer.txtstart = lexer.lexsize;
459 cleanBodyAttrs(lexer, body);
/* one rule per registered style */
461 for (style = lexer.styles; style != null; style = style.next)
463 lexer.addCharToLexer(' ');
464 lexer.addStringLiteral(style.tag);
465 lexer.addCharToLexer('.');
466 lexer.addStringLiteral(style.tagClass);
467 lexer.addCharToLexer(' ');
468 lexer.addCharToLexer('{');
469 lexer.addStringLiteral(style.properties);
470 lexer.addCharToLexer('}');
471 lexer.addCharToLexer('\n');
/* and where it ends */
474 lexer.txtend = lexer.lexsize;
/* a text node (remaining arguments not visible here) becomes the content of the style element */
476 Node.insertNodeAtEnd(node,
477 lexer.newNode(Node.TextNode,
483 now insert style element into document head
485 doc is root node. search its children for html node
486 the head node should be first child of html node
489 head = doc.findHEAD(lexer.configuration.tt);
492 Node.insertNodeAtEnd(head, node);
495 /* ensure bidirectional links are consistent */
/*
  Re-points the neighbours of node back at node: the next link of the
  previous sibling and the prev link of the next sibling. The parent
  content and last links are set on the alternative branches.
  NOTE(review): the else keywords and the body of the trailing loop over
  the children are not visible here - presumably the loop recurses into
  each child; confirm.
*/
496 private void fixNodeLinks(Node node)
500 if (node.prev != null)
501 node.prev.next = node;
503 node.parent.content = node;
505 if (node.next != null)
506 node.next.prev = node;
508 node.parent.last = node;
510 for (child = node.content; child != null; child = child.next)
515 used to strip child of node when
516 the node has one and only one child
/*
  Removes the single child of node by pulling the content of the child up
  one level: node takes over the content and last pointers of the child,
  and the child is left without content.
  NOTE(review): the body of the trailing loop over the new children is
  not visible here - presumably it updates their parent link; confirm.
*/
518 private void stripOnlyChild(Node node)
522 child = node.content;
523 node.content = child.content;
524 node.last = child.last;
525 child.content = null;
527 for (child = node.content; child != null; child = child.next)
531 /* used to strip font start and end tags */
/*
  Unlinks element from its parent while keeping its children.
  With content: the child list is spliced into the sibling chain in place
  of element, every child is re-parented, and the first child is stored
  in pnode. Without content: element is simply unlinked and its next
  sibling is stored in pnode. In both cases element.content is cleared
  at the end.
  NOTE(review): the else keywords are not visible here; the parent.last
  and parent.content assignments are presumably the else branches of the
  preceding tests.
*/
532 private void discardContainer(Node element, MutableObject pnode)
535 Node parent = element.parent;
537 if (element.content != null)
539 element.last.next = element.next;
541 if (element.next != null)
543 element.next.prev = element.last;
/* NOTE(review): same assignment as the one made before the test above */
544 element.last.next = element.next;
547 parent.last = element.last;
549 if (element.prev != null)
551 element.content.prev = element.prev;
552 element.prev.next = element.content;
555 parent.content = element.content;
/* the former children now belong to the parent of element */
557 for (node = element.content; node != null; node = node.next)
558 node.parent = parent;
560 pnode.setObject(element.content);
/* no content: plain removal from the sibling chain */
564 if (element.next != null)
565 element.next.prev = element.prev;
567 parent.last = element.prev;
569 if (element.prev != null)
570 element.prev.next = element.next;
572 parent.content = element.next;
574 pnode.setObject(element.next);
578 element.content = null;
582 Add style property to element, creating style
583 attribute as needed and adding ; delimiter
/*
  Adds the declaration property to the style attribute of node.
  The attribute list is searched for an attribute named style. If one
  exists the declaration is merged into its value with addProperty;
  otherwise a new style attribute holding property is created and put at
  the front of the attribute list.
  NOTE(review): the loop exit and the assignment of the merged string
  back to the attribute are not visible here; confirm.
*/
585 private void addStyleProperty(Node node, String property)
589 for (av = node.attributes; av != null; av = av.next)
591 if (av.attribute.equals("style"))
595 /* if style attribute already exists then insert property */
601 s = addProperty(av.value, property);
604 else /* else create new style attribute */
/* the current list is passed to the constructor, then the new attribute becomes the head */
606 av = new AttVal(node.attributes, null, '"', "style", property);
607 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
608 node.attributes = av;
613 Create new string that consists of the
614 combined style properties in s1 and s2
616 To merge property lists, we build a linked
617 list of property/values and insert properties
618 into the list in order, merging values for
619 the same property name.
/*
  Combines two style strings: s1 and then s2 are parsed into one property
  list by createProps, and the list is serialized by createPropString.
  NOTE(review): the return statement is not visible here - presumably s
  is returned; confirm.
*/
621 private String mergeProperties(String s1, String s2)
626 prop = createProps(null, s1);
627 prop = createProps(prop, s2);
628 s = createPropString(prop);
/*
  Folds the style attribute of child into the style attribute of node.
  s2 receives the style of child and s1 the style of node (each search
  starts from null). When both exist they are combined by
  mergeProperties; when only the child has one, a new style attribute
  with that value is put at the front of the attributes of node.
  NOTE(review): the outer test that precedes the first visible s2 check
  is not shown - presumably it tests s1; the loop bodies and the store of
  the merged value are also not visible. Confirm.
*/
632 private void mergeStyles(Node node, Node child)
635 String s1, s2, style;
637 for (s2 = null, av = child.attributes; av != null; av = av.next)
639 if (av.attribute.equals("style"))
646 for (s1 = null, av = node.attributes; av != null; av = av.next)
648 if (av.attribute.equals("style"))
657 if (s2 != null) /* merge styles from both */
659 style = mergeProperties(s1, s2);
663 else if (s2 != null) /* copy style of child */
665 av = new AttVal(node.attributes, null, '"', "style", s2);
666 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
667 node.attributes = av;
/*
  Converts the value of a font size attribute into a CSS font-size value.
  Three forms are distinguished in the visible code:
    - a leading digit 0 to 6, used as an index n;
    - a leading minus sign followed by a digit 0 to 6, which yields a
      percentage string, with the keyword smaller as fallback;
    - otherwise a digit 0 to 6 in second position, which yields a
      percentage string, with the keyword larger as fallback.
  NOTE(review): the table used for the first form and the loop bodies
  that scale x are not visible here; confirm the factors.
*/
671 private String fontSize2Name(String size)
/* plain size: first character is a digit 0..6 */
698 if (size.length() > 0 &&
699 '0' <= size.charAt(0) && size.charAt(0) <= '6')
701 int n = size.charAt(0) - '0';
/* relative decrease */
705 if (size.length() > 0 && size.charAt(0) == '-')
707 if (size.length() > 1 &&
708 '0' <= size.charAt(1) && size.charAt(1) <= '6')
710 int n = size.charAt(1) - '0';
/* loop runs n times starting from x = 1.0 */
713 for (x = 1.0; n > 0; --n)
/* x is truncated to an int and given a percent sign */
717 buf = "" + (int)x + "%";
722 return "smaller"; /*"70%"; */
/* remaining case: digit expected in second position */
725 if (size.length() > 1 &&
726 '0' <= size.charAt(1) && size.charAt(1) <= '6')
728 int n = size.charAt(1) - '0';
731 for (x = 1.0; n > 0; --n)
735 buf = "" + (int)x + "%";
740 return "larger"; /* "140%" */
/* Adds a font-family declaration with the given face to the style of node. */
743 private void addFontFace(Node node, String face)
745 addStyleProperty(node, "font-family: " + face);
/*
  Applies a font size to node. Sizes 6, 5 and 4 on a p element are
  special-cased; otherwise the size is converted by fontSize2Name and
  added as a font-size declaration.
  NOTE(review): what the three special cases do is not visible here -
  presumably the paragraph is turned into a heading; the null check on
  value is also not visible. Confirm.
*/
748 private void addFontSize(Node node, String size)
752 if (size.equals("6") && node.tag == tt.tagP)
759 if (size.equals("5") && node.tag == tt.tagP)
766 if (size.equals("4") && node.tag == tt.tagP)
773 value = fontSize2Name(size);
777 addStyleProperty(node, "font-size: " + value);
/* Adds a color declaration with the given value to the style of node. */
781 private void addFontColor(Node node, String color)
783 addStyleProperty(node, "color: " + color);
/* Adds a text-align declaration to the style of node, using the lower-cased align value. */
786 private void addAlign(Node node, String align)
788 /* force alignment value to lower case */
789 addStyleProperty(node, "text-align: " + align.toLowerCase());
793 add style properties to node corresponding to
794 the font face, size and color attributes
/*
  Dispatches on the attribute name: face goes to addFontFace, size to
  addFontSize and color to addFontColor, each with the attribute value.
  Other attribute names are not handled in the visible lines.
  NOTE(review): the loop over the attribute list starting at av is not
  visible here - presumably av is advanced through av.next; confirm.
*/
796 private void addFontStyles(Node node, AttVal av)
800 if (av.attribute.equals("face"))
801 addFontFace(node, av.value);
802 else if (av.attribute.equals("size"))
803 addFontSize(node, av.value);
804 else if (av.attribute.equals("color"))
805 addFontColor(node, av.value);
812 Symptom: <p align=center>
813 Action: <p style="text-align: center">
/*
  Searches the attributes of node for align, unlinks it from the list and,
  when it has a value, adds the matching text-align declaration through
  addAlign.
  NOTE(review): only the unlink for the head-of-list case is visible here;
  the handling of a preceding attribute and the loop exit are not shown.
  Confirm.
*/
815 private void textAlign(Lexer lexer, Node node)
821 for (av = node.attributes; av != null; av = av.next)
823 if (av.attribute.equals("align"))
828 node.attributes = av.next;
830 if (av.value != null)
832 addAlign(node, av.value);
843 The clean up rules use the pnode argument to return the
844 next node when the original node has been deleted
848 Symptom: <dir> <li> where <li> is only child
849 Action: coerce <dir> <li> to <div> with indent.
/*
  Rule for dir, ul and ol elements: when the only child is an li, the
  list is coerced to a div with a margin-left of 2em and the li is
  removed by stripOnlyChild.
  The commented-out lines below are an earlier variant that is disabled.
  NOTE(review): the actions under the guard tests and the returned values
  are not visible here - presumably false when the rule does not apply and
  true after the coercion; confirm.
*/
852 private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode)
856 if (node.tag == tt.tagDir ||
857 node.tag == tt.tagUl ||
858 node.tag == tt.tagOl)
860 child = node.content;
865 /* check child has no peers */
867 if (child.next != null)
870 if (child.tag != tt.tagLi)
876 /* coerce dir to div */
878 node.tag = tt.tagDiv;
879 node.element = "div";
880 addStyleProperty(node, "margin-left: 2em");
881 stripOnlyChild(node);
887 //content = child.content;
889 //child.content = null;
891 /* adjust parent and set margin on contents of <li> */
893 //for (child = content; child != null; child = child.next)
895 // child.parent = node.parent;
896 // addStyleProperty(child, "margin-left: 1em");
899 /* hook first/last into sequence */
901 //if (content != null)
903 // content.prev = node.prev;
904 // last.next = node.next;
905 // fixNodeLinks(content);
906 // fixNodeLinks(last);
911 /* ensure that new node is cleaned */
912 //pnode.setObject(cleanNode(lexer, content));
922 Action: replace <center> by <div style="text-align: center">
/*
  Rule for center elements.
  With the DropFontTags option set, the center element is removed by
  discardContainer and an inferred br element is linked into the tree
  (one variant for a center with content, one for an empty center).
  Otherwise the element is renamed to div and given a text-align: center
  declaration.
  NOTE(review): several of the link assignments for the br node and all
  return statements are not visible here; confirm the exact position at
  which the br is inserted.
*/
925 private boolean center2Div(Lexer lexer, Node node, MutableObject pnode)
927 if (node.tag == tt.tagCenter)
929 if (lexer.configuration.DropFontTags)
931 if (node.content != null)
/* saved before the container is discarded */
933 Node last = node.last;
934 Node parent = node.parent;
936 discardContainer(node, pnode);
/* from here on node refers to the new br element */
938 node = lexer.inferredTag("br");
940 if (last.next != null)
941 last.next.prev = node;
943 node.next = last.next;
947 if (parent.last == last)
950 node.parent = parent;
/* center without content */
954 Node prev = node.prev;
955 Node next = node.next;
956 Node parent = node.parent;
957 discardContainer(node, pnode);
959 node = lexer.inferredTag("br");
962 node.parent = parent;
972 parent.content = node;
/* default treatment: keep the element but turn it into a styled div */
977 node.tag = tt.tagDiv;
978 node.element = "div";
979 addStyleProperty(node, "text-align: center");
987 Symptom <div><div>...</div></div>
988 Action: merge the two divs
990 This is useful after nested <dir>s used by Word
991 for indenting have been converted to <div>s
/*
  Rule for a div whose only child is another div: the style of the child
  is merged into node by mergeStyles and the child is removed by
  stripOnlyChild.
  NOTE(review): the actions under the guard tests and the returned values
  are not visible here - presumably false when the rule does not apply and
  true after the merge; confirm.
*/
993 private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode)
997 if (node.tag != tt.tagDiv)
1000 child = node.content;
1005 if (child.tag != tt.tagDiv)
1008 if (child.next != null)
1011 mergeStyles(node, child);
1012 stripOnlyChild(node);
1017 Symptom: <ul><li><ul>...</ul></li></ul>
1018 Action: discard outer list
/*
  Rule for a ul or ol whose only child contains a list with the same tag
  as node: the inner list takes the place of the outer list, and the
  outer list and its item are emptied. If the inner list then has a
  previous sibling, further relinking follows (see the comment in the
  body) and cleanNode is called on node.
  NOTE(review): many statements of this method are not visible here,
  including the guard actions, the reassignment that makes list refer to
  the previous sibling, and the return values; do not rely on this
  summary for the exact pointer updates.
*/
1021 private boolean nestedList(Lexer lexer, Node node, MutableObject pnode)
1025 if (node.tag == tt.tagUl || node.tag == tt.tagOl)
1027 child = node.content;
1032 /* check child has no peers */
1034 if (child.next != null)
1037 list = child.content;
1042 if (list.tag != node.tag)
/* the caller continues with the node that follows the outer list */
1045 pnode.setObject(node.next);
1047 /* move inner list node into position of outer node */
1048 list.prev = node.prev;
1049 list.next = node.next;
1050 list.parent = node.parent;
1053 /* get rid of outer ul and its li */
1054 child.content = null;
1055 node.content = null;
1059 If prev node was a list the chances are this node
1060 should be appended to that list. Word has no way of
1061 recognizing nested lists and just uses indents
1064 if (list.prev != null)
1069 if (list.tag == tt.tagUl || list.tag == tt.tagOl)
1071 list.next = node.next;
1073 if (list.next != null)
1074 list.next.prev = list;
1076 child = list.last; /* <li> */
1078 node.parent = child;
1080 node.prev = child.last;
1085 cleanNode(lexer, node);
1093 Symptom: the only child of a block-level element is a
1094 presentation element such as B, I or FONT
1096 Action: add style "font-weight: bold" to the block and
1097 strip the <b> element, leaving its children.
1102 <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1107 <p style="font-weight: bold; font-family: Arial; font-size: 6">
1108 Draft Recommended Practice
1111 This code also replaces the align attribute by a style attribute.
1112 However, to avoid CSS problems with Navigator 4, this isn't done
1113 for the elements: caption, tr and table
/*
  Rule for elements whose content model has any of the CM_BLOCK, CM_LIST,
  CM_DEFLIST or CM_TABLE flags, excluding table, tr and li.
  The align attribute is converted by textAlign (not for caption). Then,
  if the element has a single child that is b, i or font, the style of
  the child is merged into node, the matching declaration (bold, italic,
  or the font attributes) is added, and the child is stripped.
  NOTE(review): the guard actions and the return values are not visible
  here - presumably true after a child has been stripped; confirm.
*/
1115 private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode)
1119 if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0)
1121 if (node.tag != tt.tagTable
1122 && node.tag != tt.tagTr
1123 && node.tag != tt.tagLi)
1125 /* check for align attribute */
1126 if (node.tag != tt.tagCaption)
1127 textAlign(lexer, node);
1129 child = node.content;
1134 /* check child has no peers */
1136 if (child.next != null)
1139 if (child.tag == tt.tagB)
1141 mergeStyles(node, child);
1142 addStyleProperty(node, "font-weight: bold");
1143 stripOnlyChild(node);
1147 if (child.tag == tt.tagI)
1149 mergeStyles(node, child);
1150 addStyleProperty(node, "font-style: italic");
1151 stripOnlyChild(node);
1155 if (child.tag == tt.tagFont)
1157 mergeStyles(node, child);
1158 addFontStyles(node, child.attributes);
1159 stripOnlyChild(node);
1168 /* the only child of table cell or an inline element such as em */
/*
  Rule for elements other than font whose content model has the CM_INLINE
  or CM_ROW flag. If the element has a single child that is b, i or font,
  the style of the child is merged into node, the matching declaration is
  added, and the child is stripped. For b and i this is done only when
  the LogicalEmphasis option is set.
  NOTE(review): the guard actions and the return values are not visible
  here; confirm.
*/
1169 private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode)
1173 if (node.tag != tt.tagFont && (node.tag.model & (Dict.CM_INLINE|Dict.CM_ROW)) != 0)
1175 child = node.content;
1180 /* check child has no peers */
1182 if (child.next != null)
1185 if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis)
1187 mergeStyles(node, child);
1188 addStyleProperty(node, "font-weight: bold");
1189 stripOnlyChild(node);
1193 if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis)
1195 mergeStyles(node, child);
1196 addStyleProperty(node, "font-style: italic");
1197 stripOnlyChild(node);
1201 if (child.tag == tt.tagFont)
1203 mergeStyles(node, child);
1204 addFontStyles(node, child.attributes);
1205 stripOnlyChild(node);
1214 Replace font elements by span elements, deleting
1215 the font element's attributes and replacing them
1216 by a single style attribute.
/*
  Rule for font elements.
  With the DropFontTags option set the element is removed through
  discardContainer. A font that is the only child of its parent is
  tested for separately (see inline comment). Otherwise the font
  attributes are converted to style declarations, the attribute list is
  reduced to the style attribute, and the element is renamed to span.
  NOTE(review): the loop that walks the attribute list and all return
  statements are not visible here; confirm.
*/
1218 private boolean font2Span(Lexer lexer, Node node, MutableObject pnode)
1220 AttVal av, style, next;
1222 if (node.tag == tt.tagFont)
1224 if (lexer.configuration.DropFontTags)
1226 discardContainer(node, pnode);
1230 /* if FONT is only child of parent element then leave alone */
1231 if (node.parent.content == node
1232 && node.next == null)
1235 addFontStyles(node, node.attributes);
1237 /* extract style attribute and free the rest */
1238 av = node.attributes;
1245 if (av.attribute.equals("style"))
1254 node.attributes = style;
1256 node.tag = tt.tagSpan;
1257 node.element = "span";
1266 Applies all matching rules to a node.
/*
  Runs the clean-up rules on node, in this order: dir2Div, nestedList,
  center2Div, mergeDivs, blockStyle, inlineStyle, font2Span. Each rule
  reports through the shared MutableObject which node to continue with;
  that value is read into next after every call. The loop continues
  while the current node is an element.
  NOTE(review): how the boolean result b is used and what is returned are
  not visible here - presumably a true result restarts the loop on next;
  confirm.
*/
1268 private Node cleanNode(Lexer lexer, Node node)
1271 MutableObject o = new MutableObject();
1274 for (next = node; node.isElement(); node = next)
1278 b = dir2Div(lexer, node, o);
1279 next = (Node)o.getObject();
1283 b = nestedList(lexer, node, o);
1284 next = (Node)o.getObject();
1288 b = center2Div(lexer, node, o);
1289 next = (Node)o.getObject();
1293 b = mergeDivs(lexer, node, o);
1294 next = (Node)o.getObject();
1298 b = blockStyle(lexer, node, o);
1299 next = (Node)o.getObject();
1303 b = inlineStyle(lexer, node, o);
1304 next = (Node)o.getObject();
1308 b = font2Span(lexer, node, o);
1309 next = (Node)o.getObject();
/*
  Applies the clean-up rules bottom-up: every child is processed
  recursively first, then cleanNode is applied to node itself and its
  result is returned.
*/
1319 private Node createStyleProperties(Lexer lexer, Node node)
1323 if (node.content != null)
1325 for (child = node.content; child != null; child = child.next)
/* the recursive call may hand back a different node; iteration continues from it */
1327 child = createStyleProperties(lexer, child);
1331 return cleanNode(lexer, node);
/*
  Walks the tree depth-first: recurses into every child, then calls
  style2Rule on node itself.
*/
1334 private void defineStyleRules(Lexer lexer, Node node)
1338 if (node.content != null)
1340 for (child = node.content;
1341 child != null; child = child.next)
1343 defineStyleRules(lexer, child);
1347 style2Rule(lexer, node);
/*
  Entry point of the clean-up: converts presentational markup to style
  properties over the whole tree, then turns the style attributes into
  class rules (defineStyleRules) and emits the style element
  (createStyleElement).
  NOTE(review): the two follow-up steps run only when the MakeClean
  option is false; confirm that this polarity is intended.
*/
1350 public void cleanTree(Lexer lexer, Node doc)
1352 doc = createStyleProperties(lexer, doc);
1354 if (!lexer.configuration.MakeClean)
1356 defineStyleRules(lexer, doc);
1357 createStyleElement(lexer, doc);
1361 /* simplifies <b><b> ... </b> ...</b> etc. */
/*
  Walks the sibling chain starting at node. A b or i element whose parent
  has the same tag is removed with discardContainer and the walk goes on
  with the node reported back through the MutableObject. For other nodes
  the method recurses into the content.
  NOTE(review): the statements that advance node are not visible here;
  confirm.
*/
1362 public void nestedEmphasis(Node node)
1364 MutableObject o = new MutableObject();
1367 while (node != null)
1371 if ((node.tag == tt.tagB || node.tag == tt.tagI)
1372 && node.parent != null && node.parent.tag == node.tag)
1374 /* strip redundant inner element */
1376 discardContainer(node, o);
1377 next = (Node)o.getObject();
1382 if (node.content != null)
1383 nestedEmphasis(node.content);
1389 /* replace i by em and b by strong */
/*
  Walks the sibling chain starting at node and retags i elements as em and
  b elements as strong (both the element name and the tag reference are
  updated), recursing into the content of every node.
  NOTE(review): the statement that advances node is not visible here -
  presumably node = node.next; confirm.
*/
1390 public void emFromI(Node node)
1392 while (node != null)
1394 if (node.tag == tt.tagI)
1396 node.element = tt.tagEm.name;
1397 node.tag = tt.tagEm;
1399 else if (node.tag == tt.tagB)
1401 node.element = tt.tagStrong.name;
1402 node.tag = tt.tagStrong;
1405 if (node.content != null)
1406 emFromI(node.content);
1413 Some people use dir or ul without an li
1414 to indent the content. The pattern to
1415 look for is a list with a single implicit
1416 li. This is recursively replaced by an
1417 implicit blockquote.
/*
  Walks the sibling chain starting at node, children first. An element
  whose tag uses the list parser and that has exactly one child, which is
  implicit, loses that child (stripOnlyChild) and is retagged as an
  implicit blockquote.
  NOTE(review): the statement that advances node is not visible here -
  presumably node = node.next; confirm.
*/
1419 public void list2BQ(Node node)
1421 while (node != null)
1423 if (node.content != null)
1424 list2BQ(node.content);
1426 if (node.tag != null && node.tag.parser == ParserImpl.getParseList() &&
1427 node.hasOneChild() && node.content.implicit)
1429 stripOnlyChild(node);
1430 node.element = tt.tagBlockquote.name;
1431 node.tag = tt.tagBlockquote;
1432 node.implicit = true;
1440 Replace implicit blockquote by div with an indent
1441 taking care to reduce nested blockquotes to a single
1442 div with the indent set to match the nesting depth
/*
  Walks the sibling chain starting at node. An implicit blockquote is
  collapsed with any directly nested single-child blockquotes
  (stripOnlyChild per level), its content is processed recursively, and
  it is retagged as a div carrying a style attribute with a margin-left
  of 2 * indent em. Other nodes are only recursed into.
  NOTE(review): the initialisation and increment of indent, the third
  condition of the inner while, and the statement that advances node are
  not visible here - presumably indent counts the collapsed levels;
  confirm.
*/
1444 public void bQ2Div(Node node)
1449 while (node != null)
1451 if (node.tag == tt.tagBlockquote && node.implicit)
1455 while(node.hasOneChild() &&
1456 node.content.tag == tt.tagBlockquote &&
1460 stripOnlyChild(node);
1463 if (node.content != null)
1464 bQ2Div(node.content);
1466 indent_buf = "margin-left: " +
1467 (new Integer(2*indent)).toString() + "em";
1469 node.element = tt.tagDiv.name;
1470 node.tag = tt.tagDiv;
1471 node.addAttribute("style", indent_buf);
1473 else if (node.content != null)
1474 bQ2Div(node.content);
1481 /* node is <![if ...]> prune up to <![endif]> */
/*
  Discards nodes starting at the given section node. Nodes are removed
  with Node.discardElement, which yields the following node. A nested
  section tag whose text starts with if is pruned recursively; a section
  tag whose text starts with endif is discarded as well.
  NOTE(review): the enclosing loop, its exit and the return statements
  are not visible here - presumably the node after the matching endif is
  returned; confirm.
*/
1482 public Node pruneSection(Lexer lexer, Node node)
1486 /* discard node and returns next */
1487 node = Node.discardElement(node);
1492 if (node.type == Node.SectionTag)
/* first two characters of the section text */
1494 if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
1496 node = pruneSection(lexer, node);
/* first five characters of the section text */
1500 if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif"))
1502 node = Node.discardElement(node);
/*
  Walks the sibling chain starting at node and removes section tags. A
  section whose text starts with if is removed together with everything
  up to its matching endif through pruneSection; any other section tag
  is discarded on its own. Non-section nodes are recursed into.
  NOTE(review): the continue and advance statements are not visible
  here; confirm.
*/
1511 public void dropSections(Lexer lexer, Node node)
1513 while (node != null)
1515 if (node.type == Node.SectionTag)
1517 /* prune up to matching endif */
1518 if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
1520 node = pruneSection(lexer, node);
1524 /* discard others as well */
1525 node = Node.discardElement(node);
1529 if (node.content != null)
1530 dropSections(lexer, node.content);
/*
  Walks the attribute list of node and tests each attribute against two
  conditions:
    - class with the value Code (special case, see inline comment);
    - class, style, lang, any name starting with x:, and height or width
      on td, tr or th elements.
  NOTE(review): the actions taken in each branch are not visible here -
  presumably the first case keeps the attribute and the second unlinks it
  (only the head-of-list unlink is shown); confirm.
*/
1536 public void purgeAttributes(Node node)
1538 AttVal attr = node.attributes;
1542 while (attr != null)
1546 /* special check for class="Code" denoting pre text */
1547 if (attr.attribute != null &&
1548 attr.value != null &&
1549 attr.attribute.equals("class") &&
1550 attr.value.equals("Code"))
1554 else if (attr.attribute != null &&
1555 (attr.attribute.equals("class") ||
1556 attr.attribute.equals("style") ||
1557 attr.attribute.equals("lang") ||
1558 attr.attribute.startsWith("x:") ||
1559 ((attr.attribute.equals("height") || attr.attribute.equals("width")) &&
1560 (node.tag == tt.tagTd || node.tag == tt.tagTr || node.tag == tt.tagTh))))
1565 node.attributes = next;
1575 /* Word2000 uses span excessively, so we strip span out */
/*
  Removes a span element while keeping its content. The content is first
  cleaned with cleanWord2000, then moved node by node out of the span:
  the visible code inserts one node before the span and the remaining
  ones after prev. Finally the emptied span is discarded.
  NOTE(review): the assignments to prev and node and the return value are
  not visible here; the result of the final discardElement call is not
  assigned in the visible line. Confirm what is returned.
*/
1576 public Node stripSpan(Lexer lexer, Node span)
1583 deal with span elements that have content
1584 by splicing the content in place of the span
1585 after having processed it
1588 cleanWord2000(lexer, span.content);
1589 content = span.content;
1591 if (span.prev != null)
1593 else if (content != null)
/* span has no previous sibling: move the first content node in front of the span */
1596 content = content.next;
1597 Node.removeNode(node);
1598 Node.insertNodeBeforeElement(span, node);
/* move the remaining content nodes one after another */
1602 while (content != null)
1605 content = content.next;
1606 Node.removeNode(node);
1607 Node.insertNodeAfterElement(prev, node);
/* span was the last child: the parent now ends at the last moved node */
1611 if (span.next == null)
1612 span.parent.last = prev;
1615 span.content = null;
1616 Node.discardElement(span);
1620 /* map non-breaking spaces to regular spaces */
/*
  Walks the sibling chain starting at node, recursing into content, and
  rewrites the text of every text node in place: the characters between
  node.start and node.end are read (decoding UTF-8 sequences with
  PPrint.getUTF8) and written back with PPrint.putUTF8 at position p.
  NOTE(review): the test for a multibyte lead byte, the replacement of
  the non-breaking space value, and the advance of node are not visible
  here; confirm.
*/
1621 private void normalizeSpaces(Lexer lexer, Node node)
1623 while (node != null)
1625 if (node.content != null)
1626 normalizeSpaces(lexer, node.content);
1628 if (node.type == Node.TextNode)
1631 MutableInteger c = new MutableInteger();
1634 for (i = node.start; i < node.end; ++i)
1636 c.value = (int)node.textarray[i];
1638 /* look for UTF-8 multibyte character */
/* getUTF8 stores the decoded value in c; its result is added to i */
1640 i += PPrint.getUTF8(node.textarray, i, c);
1645 p = PPrint.putUTF8(node.textarray, p, c.value);
1654 This is a major clean up to strip out all the extra stuff you get
1655 when you save as web page from Word 2000. It doesn't yet know what
1656 to do with VML tags, but these will appear as errors unless you
1657 declare them as new tags, such as o:p which needs to be declared
/*
  Walks the sibling chain starting at node and removes Word 2000 export
  clutter:
    - style and meta elements and comments are discarded;
    - span elements are replaced by their content (stripSpan);
    - on the html element the xmlns:o attribute is checked;
    - link elements with rel equal to File-List are discarded;
    - empty p elements are discarded;
    - runs of p elements with class MsoListBullet are turned into li
      elements collected in an inferred ul;
    - runs of p elements with class Code are collected in an inferred
      pre, each followed by a new-line node;
    - class, style and similar attributes are purged from start tags,
      and the content of each node is processed recursively.
  NOTE(review): the continue statements, the advance of node, the reset
  of list and the action taken when xmlns:o is missing are not visible
  here; confirm.
*/
1660 public void cleanWord2000(Lexer lexer, Node node)
1662 /* used to build a list from a sequence of bulleted p's */
1665 while (node != null)
1667 /* discard Word's style verbiage */
1668 if (node.tag == tt.tagStyle ||
1669 node.tag == tt.tagMeta ||
1670 node.type == Node.CommentTag)
1672 node = Node.discardElement(node);
1676 /* strip out all span tags Word scatters so liberally! */
1677 if (node.tag == tt.tagSpan)
1679 node = stripSpan(lexer, node);
1683 /* get rid of Word's xmlns attributes */
1684 if (node.tag == tt.tagHtml)
1686 /* check that it's a Word 2000 document */
1687 if (node.getAttrByName("xmlns:o") == null)
1691 if (node.tag == tt.tagLink)
1693 AttVal attr = node.getAttrByName("rel");
1695 if (attr != null && attr.value != null &&
1696 attr.value.equals("File-List"))
1698 node = Node.discardElement(node);
1703 /* discard empty paragraphs */
1704 if (node.content == null && node.tag == tt.tagP)
1706 node = Node.discardElement(node);
1710 if (node.tag == tt.tagP)
1712 AttVal attr = node.getAttrByName("class");
1714 /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
1715 if (attr != null && attr.value != null &&
1716 attr.value.equals("MsoListBullet"))
1718 Node.coerceNode(lexer, node, tt.tagLi);
/* start a new ul unless the previous paragraph already opened one */
1720 if (list == null || list.tag != tt.tagUl)
1722 list = lexer.inferredTag("ul");
1723 Node.insertNodeBeforeElement(node, list);
1726 purgeAttributes(node);
1728 if (node.content != null)
1729 cleanWord2000(lexer, node.content);
1731 /* remove node and append to contents of list */
1732 Node.removeNode(node);
1733 Node.insertNodeAtEnd(list, node);
1736 /* map sequence of <p class="Code"> to <pre>...</pre> */
1737 else if (attr != null && attr.value != null &&
1738 attr.value.equals("Code"))
1740 Node br = lexer.newLineNode();
1741 normalizeSpaces(lexer, node);
/* start a new pre unless the previous paragraph already opened one */
1743 if (list == null || list.tag != tt.tagPre)
1745 list = lexer.inferredTag("pre");
1746 Node.insertNodeBeforeElement(node, list);
1749 /* remove node and append to contents of list */
1750 Node.removeNode(node);
1751 Node.insertNodeAtEnd(list, node);
/* the p wrapper itself is stripped, leaving its content inside the pre */
1752 stripSpan(lexer, node);
1753 Node.insertNodeAtEnd(list, br);
1762 /* strip out style and class attributes */
1763 if (node.type == Node.StartTag || node.type == Node.StartEndTag)
1764 purgeAttributes(node);
1766 if (node.content != null)
1767 cleanWord2000(lexer, node.content);
/*
  Reports whether the document looks like a Word 2000 export: true when
  root has an html element (found via findHTML) that carries an xmlns:o
  attribute.
*/
1773 public boolean isWord2000(Node root, TagTable tt)
1775 Node html = root.findHTML(tt);
1777 return (html != null && html.getAttrByName("xmlns:o") != null);