net.sourceforge.phpeclipse/src/org/w3c/tidy/Clean.java

   1 /*
   2  * @(#)Clean.java   1.11 2000/08/16
   3  *
   4  */
   5
   6 package org.w3c.tidy;
   7
   8 /**
   9  *
  10  * Clean up misuse of presentation markup
  11  *
  12  * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
  13  * See Tidy.java for the copyright notice.
  14  * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
  15  * HTML Tidy Release 4 Aug 2000</a>
  16  *
  17  * @author  Dave Raggett <dsr@w3.org>
  18  * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
  19  * @version 1.0, 1999/05/22
  20  * @version 1.0.1, 1999/05/29
  21  * @version 1.1, 1999/06/18 Java Bean
  22  * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
  23  * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
  24  * @version 1.4, 1999/09/04 DOM support
  25  * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
  26  * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
  27  * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
  28  * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
  29  * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
  30  * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
  31  * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
  32  */
  33
  34 /*
  35   Filters from other formats such as Microsoft Word
  36   often make excessive use of presentation markup such
  37   as font tags, B, I, and the align attribute. By applying
   38   a set of production rules, it is straightforward to
  39   transform this to use CSS.
  40
  41   Some rules replace some of the children of an element by
  42   style properties on the element, e.g.
  43
  44   <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
  45
  46   Such rules are applied to the element's content and then
   47   to the element itself until no more rules apply.
  48   Having applied all the rules to an element, it will have
  49   a style attribute with one or more properties.
  50
  51   Other rules strip the element they apply to, replacing
  52   it by style properties on the contents, e.g.
  53
  54   <dir><li><p>...</li></dir> -> <p style="margin-left 1em">...
  55
  56   These rules are applied to an element before processing
  57   its content and replace the current element by the first
  58   element in the exposed content.
  59
  60   After applying both sets of rules, you can replace the
  61   style attribute by a class value and style rule in the
  62   document head. To support this, an association of styles
  63   and class names is built.
  64
  65   A naive approach is to rely on string matching to test
  66   when two property lists are the same. A better approach
  67   would be to first sort the properties before matching.
  68 */
  69
  70 public class Clean {
  71
  72     private int classNum = 1;
  73
  74     private TagTable tt;
  75
  76     public Clean(TagTable tt)
  77     {
  78       this.tt = tt;
  79     }
  80
  81     private StyleProp insertProperty(StyleProp props, String name,
  82                                             String value)
  83     {
  84         StyleProp first, prev, prop;
  85         int cmp;
  86
  87         prev = null;
  88         first = props;
  89
  90         while (props != null)
  91         {
  92             cmp = props.name.compareTo(name);
  93
  94             if (cmp == 0)
  95             {
  96                 /* this property is already defined, ignore new value */
  97                 return first;
  98             }
  99
 100             if (cmp > 0) // props.name > name
 101             {
 102                 /* insert before this */
 103
 104                 prop = new StyleProp(name, value, props);
 105
 106                 if (prev != null)
 107                     prev.next = prop;
 108                 else
 109                     first = prop;
 110
 111                 return first;
 112             }
 113
 114             prev = props;
 115             props = props.next;
 116         }
 117
 118         prop = new StyleProp(name, value);
 119
 120         if (prev != null)
 121             prev.next = prop;
 122         else
 123             first = prop;
 124
 125         return first;
 126     }
 127
 128     /*
  129      Create sorted linked list of properties from style string.
  130      The string is scanned for "name: value" pairs separated
  131      by ';'. (The original C code temporarily wrote nulls over
  132      the ':' and ';' delimiters and therefore copied the string;
  133      this Java version uses substring, so no copy is needed.)
 134     */
 135     private StyleProp createProps(StyleProp prop, String style)
 136     {
 137         int name_end;
 138         int value_end;
 139         int value_start = 0;
 140         int name_start = 0;
 141         boolean more;
 142
 143         name_start = 0;
 144         while (name_start < style.length())
 145         {
 146             while (name_start < style.length() &&
 147                        style.charAt(name_start) == ' ')
 148                 ++name_start;
 149
 150             name_end = name_start;
 151
 152             while (name_end < style.length())
 153             {
 154                 if (style.charAt(name_end) == ':')
 155                 {
 156                     value_start = name_end + 1;
 157                     break;
 158                 }
 159
 160                 ++name_end;
 161             }
 162
 163             if (name_end >= style.length() || style.charAt(name_end) != ':')
 164                 break;
 165
 166             while (value_start < style.length() &&
 167                        style.charAt(value_start) == ' ')
 168                 ++value_start;
 169
 170             value_end = value_start;
 171             more = false;
 172
 173             while (value_end < style.length())
 174             {
 175                 if (style.charAt(value_end) == ';')
 176                 {
 177                     more = true;
 178                     break;
 179                 }
 180
 181                 ++value_end;
 182             }
 183
 184             prop = insertProperty(prop,
 185                                   style.substring(name_start, name_end),
 186                                   style.substring(value_start, value_end));
 187
 188             if (more)
 189             {
 190                 name_start = value_end + 1;
 191                 continue;
 192             }
 193
 194             break;
 195         }
 196
 197         return prop;
 198     }
 199
 200     private String createPropString(StyleProp props)
 201     {
 202         String style = "";
 203         int len;
 204         StyleProp prop;
 205
 206         /* compute length */
 207
 208         for (len = 0, prop = props; prop != null; prop = prop.next)
 209         {
 210             len += prop.name.length() + 2;
 211             len += prop.value.length() + 2;
 212         }
 213
 214         for (prop = props; prop != null; prop = prop.next)
 215         {
 216             style = style.concat(prop.name);
 217             style = style.concat(": ");
 218
 219             style = style.concat(prop.value);
 220
 221             if (prop.next == null)
 222                 break;
 223
 224             style = style.concat("; ");
 225         }
 226
 227         return style;
 228     }
 229
 230     /*
 231       create string with merged properties
 232     */
 233     private String addProperty(String style, String property)
 234     {
 235         StyleProp prop;
 236
 237         prop = createProps(null, style);
 238         prop = createProps(prop, property);
 239         style = createPropString(prop);
 240         return style;
 241     }
 242
 243     private String gensymClass(String tag)
 244     {
 245         String str;
 246
 247         str = "c" + classNum;
 248         classNum++;
 249         return str;
 250     }
 251
 252     private String findStyle(Lexer lexer, String tag, String properties)
 253     {
 254         Style style;
 255
 256         for (style = lexer.styles; style != null; style=style.next)
 257         {
 258             if (style.tag.equals(tag) &&
 259                 style.properties.equals(properties))
 260                 return style.tagClass;
 261         }
 262
 263         style = new Style(tag, gensymClass(tag), properties, lexer.styles);
 264         lexer.styles = style;
 265         return style.tagClass;
 266     }
 267
 268     /*
 269      Find style attribute in node, and replace it
 270      by corresponding class attribute. Search for
 271      class in style dictionary otherwise gensym
 272      new class and add to dictionary.
 273
  274      An existing class attribute is kept; the new class is appended to it
 275     */
 276     private void style2Rule(Lexer lexer, Node node)
 277     {
 278         AttVal styleattr, classattr;
 279         String classname;
 280
 281         styleattr = node.getAttrByName("style");
 282
 283         if (styleattr != null)
 284         {
 285                 classname = findStyle(lexer, node.element, styleattr.value);
 286                 classattr = node.getAttrByName("class");
 287
 288                 /*
 289                  if there already is a class attribute
 290                  then append class name after a space
 291                 */
 292                 if (classattr != null)
 293                 {
 294                         classattr.value = classattr.value + " " + classname;
 295                         node.removeAttribute(styleattr);
 296                 }
 297                 else /* reuse style attribute for class attribute */
 298                 {
 299                         styleattr.attribute = "class";
 300                         styleattr.value = classname;
 301                 }
 302         }
 303     }
 304
 305     private void addColorRule(Lexer lexer, String selector, String color)
 306     {
 307         if (color != null)
 308         {
 309             lexer.addStringLiteral(selector);
 310             lexer.addStringLiteral(" { color: ");
 311             lexer.addStringLiteral(color);
 312             lexer.addStringLiteral(" }\n");
 313         }
 314     }
 315
 316     /*
 317      move presentation attribs from body to style element
 318
 319      background="foo" ->  body { background-image: url(foo) }
 320      bgcolor="foo"    ->  body { background-color: foo }
 321      text="foo"       ->  body { color: foo }
 322      link="foo"       ->  :link { color: foo }
 323      vlink="foo"      ->  :visited { color: foo }
 324      alink="foo"      ->  :active { color: foo }
 325     */
 326     private void cleanBodyAttrs(Lexer lexer, Node body)
 327     {
 328         AttVal attr;
 329         String bgurl = null;
 330         String bgcolor = null;
 331         String color = null;
 332
 333         attr = body.getAttrByName("background");
 334
 335         if (attr != null)
 336         {
 337             bgurl = attr.value;
 338             attr.value = null;
 339             body.removeAttribute(attr);
 340         }
 341
 342         attr = body.getAttrByName("bgcolor");
 343
 344         if (attr != null)
 345         {
 346             bgcolor = attr.value;
 347             attr.value = null;
 348             body.removeAttribute(attr);
 349         }
 350
 351         attr = body.getAttrByName("text");
 352
 353         if (attr != null)
 354         {
 355             color = attr.value;
 356             attr.value = null;
 357             body.removeAttribute(attr);
 358         }
 359
 360         if (bgurl != null || bgcolor != null || color != null)
 361         {
 362             lexer.addStringLiteral(" body {\n");
 363
 364             if (bgurl != null)
 365             {
 366                 lexer.addStringLiteral("  background-image: url(");
 367                 lexer.addStringLiteral(bgurl);
 368                 lexer.addStringLiteral(");\n");
 369             }
 370
 371             if (bgcolor != null)
 372             {
 373                 lexer.addStringLiteral("  background-color: ");
 374                 lexer.addStringLiteral(bgcolor);
 375                 lexer.addStringLiteral(";\n");
 376             }
 377
 378             if (color != null)
 379             {
 380                 lexer.addStringLiteral("  color: ");
 381                 lexer.addStringLiteral(color);
 382                 lexer.addStringLiteral(";\n");
 383             }
 384
 385             lexer.addStringLiteral(" }\n");
 386         }
 387
 388         attr = body.getAttrByName("link");
 389
 390         if (attr != null)
 391         {
 392             addColorRule(lexer, " :link", attr.value);
 393             body.removeAttribute(attr);
 394         }
 395
 396         attr = body.getAttrByName("vlink");
 397
 398         if (attr != null)
 399         {
 400             addColorRule(lexer, " :visited", attr.value);
 401             body.removeAttribute(attr);
 402         }
 403
 404         attr = body.getAttrByName("alink");
 405
 406         if (attr != null)
 407         {
 408             addColorRule(lexer, " :active", attr.value);
 409             body.removeAttribute(attr);
 410         }
 411     }
 412
 413     private boolean niceBody(Lexer lexer, Node doc)
 414     {
 415         Node body = doc.findBody(lexer.configuration.tt);
 416
 417         if (body != null)
 418         {
 419             if (
 420                 body.getAttrByName("background") != null ||
 421                 body.getAttrByName("bgcolor") != null ||
 422                 body.getAttrByName("text") != null ||
 423                 body.getAttrByName("link") != null ||
 424                 body.getAttrByName("vlink") != null ||
 425                 body.getAttrByName("alink") != null
 426                )
 427             {
 428                 lexer.badLayout |= Report.USING_BODY;
 429                 return false;
 430             }
 431         }
 432
 433         return true;
 434     }
 435
 436     /* create style element using rules from dictionary */
 437     private void createStyleElement(Lexer lexer, Node doc)
 438     {
 439         Node node, head, body;
 440         Style style;
 441         AttVal av;
 442
 443         if (lexer.styles == null && niceBody(lexer, doc))
 444             return;
 445
 446         node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
 447         node.implicit = true;
 448
 449         /* insert type attribute */
 450         av = new AttVal(null, null, '"', "type", "text/css");
 451         av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
 452         node.attributes = av;
 453
 454         body = doc.findBody(lexer.configuration.tt);
 455
 456         lexer.txtstart = lexer.lexsize;
 457
 458         if (body != null)
 459             cleanBodyAttrs(lexer, body);
 460
 461         for (style = lexer.styles; style != null; style = style.next)
 462         {
 463             lexer.addCharToLexer(' ');
 464             lexer.addStringLiteral(style.tag);
 465             lexer.addCharToLexer('.');
 466             lexer.addStringLiteral(style.tagClass);
 467             lexer.addCharToLexer(' ');
 468             lexer.addCharToLexer('{');
 469             lexer.addStringLiteral(style.properties);
 470             lexer.addCharToLexer('}');
 471             lexer.addCharToLexer('\n');
 472         }
 473
 474         lexer.txtend = lexer.lexsize;
 475
 476         Node.insertNodeAtEnd(node,
 477                              lexer.newNode(Node.TextNode,
 478                                       lexer.lexbuf,
 479                                       lexer.txtstart,
 480                                       lexer.txtend));
 481
 482         /*
 483          now insert style element into document head
 484
 485          doc is root node. search its children for html node
 486          the head node should be first child of html node
 487         */
 488
 489         head = doc.findHEAD(lexer.configuration.tt);
 490
 491         if (head != null)
 492             Node.insertNodeAtEnd(head, node);
 493     }
 494
 495     /* ensure bidirectional links are consistent */
 496     private void fixNodeLinks(Node node)
 497     {
 498         Node child;
 499
 500         if (node.prev != null)
 501             node.prev.next = node;
 502         else
 503             node.parent.content = node;
 504
 505         if (node.next != null)
 506             node.next.prev = node;
 507         else
 508             node.parent.last = node;
 509
 510         for (child = node.content; child != null; child = child.next)
 511             child.parent = node;
 512     }
 513
 514     /*
 515      used to strip child of node when
 516      the node has one and only one child
 517     */
 518     private void stripOnlyChild(Node node)
 519     {
 520         Node child;
 521
 522         child = node.content;
 523         node.content = child.content;
 524         node.last = child.last;
 525         child.content = null;
 526
 527         for (child = node.content; child != null; child = child.next)
 528             child.parent = node;
 529     }
 530
 531     /* used to strip font start and end tags */
 532     private void discardContainer(Node element, MutableObject pnode)
 533     {
 534         Node node;
 535         Node parent = element.parent;
 536
 537         if (element.content != null)
 538         {
 539             element.last.next = element.next;
 540
 541             if (element.next != null)
 542             {
 543                 element.next.prev = element.last;
 544                 element.last.next = element.next;
 545             }
 546             else
 547                 parent.last = element.last;
 548
 549             if (element.prev != null)
 550             {
 551                 element.content.prev = element.prev;
 552                 element.prev.next = element.content;
 553             }
 554             else
 555                 parent.content = element.content;
 556
 557             for (node = element.content; node != null; node = node.next)
 558                 node.parent = parent;
 559
 560             pnode.setObject(element.content);
 561         }
 562         else
 563         {
 564             if (element.next != null)
 565                 element.next.prev = element.prev;
 566             else
 567                 parent.last = element.prev;
 568
 569             if (element.prev != null)
 570                 element.prev.next = element.next;
 571             else
 572                 parent.content = element.next;
 573
 574             pnode.setObject(element.next);
 575         }
 576
 577         element.next = null;
 578         element.content = null;
 579     }
 580
 581     /*
 582      Add style property to element, creating style
 583      attribute as needed and adding ; delimiter
 584     */
 585     private void addStyleProperty(Node node, String property)
 586     {
 587         AttVal av;
 588
 589         for (av = node.attributes; av != null; av = av.next)
 590         {
 591             if (av.attribute.equals("style"))
 592                 break;
 593         }
 594
 595         /* if style attribute already exists then insert property */
 596
 597         if (av != null)
 598         {
 599             String s;
 600
 601             s = addProperty(av.value, property);
 602             av.value = s;
 603         }
 604         else /* else create new style attribute */
 605         {
 606             av = new AttVal(node.attributes, null, '"', "style", property);
 607             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
 608             node.attributes = av;
 609         }
 610     }
 611
 612     /*
 613       Create new string that consists of the
 614       combined style properties in s1 and s2
 615
 616       To merge property lists, we build a linked
 617       list of property/values and insert properties
  618       into the list in order; for a repeated property
  619       name the first value inserted is kept.
 620     */
 621     private String mergeProperties(String s1, String s2)
 622     {
 623         String s;
 624         StyleProp prop;
 625
 626         prop = createProps(null, s1);
 627         prop = createProps(prop, s2);
 628         s = createPropString(prop);
 629         return s;
 630     }
 631
 632     private void mergeStyles(Node node, Node child)
 633     {
 634         AttVal av;
 635         String s1, s2, style;
 636
 637         for (s2 = null, av = child.attributes; av != null; av = av.next)
 638         {
 639             if (av.attribute.equals("style"))
 640             {
 641                 s2 = av.value;
 642                 break;
 643             }
 644         }
 645
 646         for (s1 = null, av = node.attributes; av != null; av = av.next)
 647         {
 648             if (av.attribute.equals("style"))
 649             {
 650                 s1 = av.value;
 651                 break;
 652             }
 653         }
 654
 655         if (s1 != null)
 656         {
 657             if (s2 != null)  /* merge styles from both */
 658             {
 659                 style = mergeProperties(s1, s2);
 660                 av.value = style;
 661             }
 662         }
 663         else if (s2 != null)  /* copy style of child */
 664         {
 665             av = new AttVal(node.attributes, null, '"', "style", s2);
 666             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
 667             node.attributes = av;
 668         }
 669     }
 670
 671     private String fontSize2Name(String size)
 672     {
 673         /*
 674         String[] sizes =
 675         {
 676             "50%",
 677             "60%",
 678             "80%",
 679             null,
 680             "120%",
 681             "150%",
 682             "200%"
 683         };
 684         */
 685
 686         String[] sizes =
 687         {
 688             "60%",
 689             "70%",
 690             "80%",
 691             null,
 692             "120%",
 693             "150%",
 694             "200%"
 695         };
 696         String buf;
 697
 698         if (size.length() > 0 &&
 699             '0' <= size.charAt(0) && size.charAt(0) <= '6')
 700         {
 701             int n = size.charAt(0) - '0';
 702             return sizes[n];
 703         }
 704
 705         if (size.length() > 0 && size.charAt(0) == '-')
 706         {
 707             if (size.length() > 1 &&
 708                 '0' <= size.charAt(1) && size.charAt(1) <= '6')
 709             {
 710                 int n = size.charAt(1) - '0';
 711                 double x;
 712
 713                 for (x = 1.0; n > 0; --n)
 714                     x *= 0.8;
 715
 716                 x *= 100.0;
 717                 buf = "" + (int)x + "%";
 718
 719                 return buf;
 720             }
 721
 722             return "smaller"; /*"70%"; */
 723         }
 724
 725         if (size.length() > 1 &&
 726             '0' <= size.charAt(1) && size.charAt(1) <= '6')
 727         {
 728             int n = size.charAt(1) - '0';
 729             double x;
 730
 731             for (x = 1.0; n > 0; --n)
 732                 x *= 1.2;
 733
 734             x *= 100.0;
 735             buf = "" + (int)x + "%";
 736
 737             return buf;
 738         }
 739
 740         return "larger"; /* "140%" */
 741     }
 742
 743     private void addFontFace(Node node, String face)
 744     {
 745         addStyleProperty(node, "font-family: " + face);
 746     }
 747
 748     private void addFontSize(Node node, String size)
 749     {
 750         String value;
 751
 752         if (size.equals("6") && node.tag == tt.tagP)
 753         {
 754             node.element = "h1";
 755             tt.findTag(node);
 756             return;
 757         }
 758
 759         if (size.equals("5") && node.tag == tt.tagP)
 760         {
 761             node.element = "h2";
 762             tt.findTag(node);
 763             return;
 764         }
 765
 766         if (size.equals("4") && node.tag == tt.tagP)
 767         {
 768             node.element = "h3";
 769             tt.findTag(node);
 770             return;
 771         }
 772
 773         value = fontSize2Name(size);
 774
 775         if (value != null)
 776         {
 777             addStyleProperty(node, "font-size: " + value);
 778         }
 779     }
 780
 781     private void addFontColor(Node node, String color)
 782     {
 783         addStyleProperty(node, "color: " + color);
 784     }
 785
 786     private void addAlign(Node node, String align)
 787     {
 788         /* force alignment value to lower case */
 789         addStyleProperty(node, "text-align: " + align.toLowerCase());
 790     }
 791
 792     /*
 793      add style properties to node corresponding to
 794      the font face, size and color attributes
 795     */
 796     private void addFontStyles(Node node, AttVal av)
 797     {
 798         while (av != null)
 799         {
 800             if (av.attribute.equals("face"))
 801                 addFontFace(node, av.value);
 802             else if (av.attribute.equals("size"))
 803                 addFontSize(node, av.value);
 804             else if (av.attribute.equals("color"))
 805                 addFontColor(node, av.value);
 806
 807             av = av.next;
 808         }
 809     }
 810
 811     /*
 812         Symptom: <p align=center>
 813         Action: <p style="text-align: center">
 814     */
 815     private void textAlign(Lexer lexer, Node node)
 816     {
 817         AttVal av, prev;
 818
 819         prev = null;
 820
 821         for (av = node.attributes; av != null; av = av.next)
 822         {
 823             if (av.attribute.equals("align"))
 824             {
 825                 if (prev != null)
 826                     prev.next = av.next;
 827                 else
 828                     node.attributes = av.next;
 829
 830                 if (av.value != null)
 831                 {
 832                     addAlign(node, av.value);
 833                 }
 834
 835                 break;
 836             }
 837
 838             prev = av;
 839         }
 840     }
 841
 842     /*
 843        The clean up rules use the pnode argument to return the
  844        next node when the original node has been deleted
 845     */
 846
 847     /*
 848         Symptom: <dir> <li> where <li> is only child
 849         Action: coerce <dir> <li> to <div> with indent.
 850     */
 851
 852     private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode)
 853     {
 854         Node child;
 855
 856         if (node.tag == tt.tagDir ||
 857             node.tag == tt.tagUl ||
 858             node.tag == tt.tagOl)
 859         {
 860             child = node.content;
 861
 862             if (child == null)
 863                 return false;
 864
 865             /* check child has no peers */
 866
 867             if (child.next != null)
 868                 return false;
 869
 870             if (child.tag != tt.tagLi)
 871                 return false;
 872
 873             if (!child.implicit)
 874                 return false;
 875
 876             /* coerce dir to div */
 877
 878             node.tag = tt.tagDiv;
 879             node.element = "div";
 880             addStyleProperty(node, "margin-left: 2em");
 881             stripOnlyChild(node);
 882             return true;
 883
 884 //#if 0
 885             //Node content;
 886             //Node last;
 887             //content = child.content;
 888             //last = child.last;
 889             //child.content = null;
 890
 891             /* adjust parent and set margin on contents of <li> */
 892
 893             //for (child = content; child != null; child = child.next)
 894             //{
 895             //    child.parent = node.parent;
 896             //    addStyleProperty(child, "margin-left: 1em");
 897             //}
 898
 899             /* hook first/last into sequence */
 900
 901             //if (content != null)
 902             //{
 903             //    content.prev = node.prev;
 904             //    last.next = node.next;
 905             //    fixNodeLinks(content);
 906             //    fixNodeLinks(last);
 907             //}
 908
 909             //node.next = null;
 910
 911             /* ensure that new node is cleaned */
 912             //pnode.setObject(cleanNode(lexer, content));
 913             //return true;
 914 //#endif
 915         }
 916
 917         return false;
 918     }
 919
 920     /*
 921         Symptom: <center>
 922         Action: replace <center> by <div style="text-align: center">
 923     */
 924
 925     private boolean center2Div(Lexer lexer, Node node, MutableObject pnode)
 926     {
 927         if (node.tag == tt.tagCenter)
 928         {
 929             if (lexer.configuration.DropFontTags)
 930             {
 931                 if (node.content != null)
 932                 {
 933                     Node last = node.last;
 934                     Node parent = node.parent;
 935
 936                     discardContainer(node, pnode);
 937
 938                     node = lexer.inferredTag("br");
 939
 940                     if (last.next != null)
 941                         last.next.prev = node;
 942
 943                     node.next = last.next;
 944                     last.next = node;
 945                     node.prev = last;
 946
 947                     if (parent.last == last)
 948                         parent.last = node;
 949
 950                     node.parent = parent;
 951                 }
 952                 else
 953                 {
 954                     Node prev = node.prev;
 955                     Node next = node.next;
 956                     Node parent = node.parent;
 957                     discardContainer(node, pnode);
 958
 959                     node = lexer.inferredTag("br");
 960                     node.next = next;
 961                     node.prev = prev;
 962                     node.parent = parent;
 963
 964                     if (next != null)
 965                         next.prev = node;
 966                     else
 967                         parent.last = node;
 968
 969                     if (prev != null)
 970                         prev.next = node;
 971                     else
 972                         parent.content = node;
 973                 }
 974
 975                 return true;
 976             }
 977             node.tag = tt.tagDiv;
 978             node.element = "div";
 979             addStyleProperty(node, "text-align: center");
 980             return true;
 981         }
 982
 983         return false;
 984     }
 985
 986     /*
 987         Symptom <div><div>...</div></div>
 988         Action: merge the two divs
 989
 990       This is useful after nested <dir>s used by Word
 991       for indenting have been converted to <div>s
 992     */
 993     private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode)
 994     {
 995         Node child;
 996
 997         if (node.tag != tt.tagDiv)
 998             return false;
 999
1000         child = node.content;
1001
1002         if (child == null)
1003             return false;
1004
1005         if (child.tag != tt.tagDiv)
1006             return false;
1007
1008         if (child.next != null)
1009             return false;
1010
1011         mergeStyles(node, child);
1012         stripOnlyChild(node);
1013         return true;
1014     }
1015
1016     /*
1017         Symptom: <ul><li><ul>...</ul></li></ul>
1018         Action: discard outer list
1019     */
1020
1021     private boolean nestedList(Lexer lexer, Node node, MutableObject pnode)
1022     {
1023         Node child, list;
1024
1025         if (node.tag == tt.tagUl || node.tag == tt.tagOl)
1026         {
1027             child = node.content;
1028
1029             if (child == null)
1030                 return false;
1031
1032             /* check child has no peers */
1033
1034             if (child.next != null)
1035                 return false;
1036
1037             list = child.content;
1038
1039             if (list == null)
1040                 return false;
1041
1042             if (list.tag != node.tag)
1043                 return false;
1044
1045             pnode.setObject(node.next);
1046
1047             /* move inner list node into position of outer node */
1048             list.prev = node.prev;
1049             list.next = node.next;
1050             list.parent = node.parent;
1051             fixNodeLinks(list);
1052
1053             /* get rid of outer ul and its li */
1054             child.content = null;
1055             node.content = null;
1056             node.next = null;
1057
1058             /*
1059               If prev node was a list the chances are this node
1060               should be appended to that list. Word has no way of
1061               recognizing nested lists and just uses indents
1062             */
1063
1064             if (list.prev != null)
1065             {
1066                 node = list;
1067                 list = node.prev;
1068
1069                 if (list.tag == tt.tagUl || list.tag == tt.tagOl)
1070                 {
1071                     list.next = node.next;
1072
1073                     if (list.next != null)
1074                         list.next.prev = list;
1075
1076                     child = list.last;  /* <li> */
1077
1078                     node.parent = child;
1079                     node.next = null;
1080                     node.prev = child.last;
1081                     fixNodeLinks(node);
1082                 }
1083             }
1084
1085             cleanNode(lexer, node);
1086             return true;
1087         }
1088
1089         return false;
1090     }
1091
1092     /*
1093         Symptom: the only child of a block-level element is a
1094         presentation element such as B, I or FONT
1095
1096         Action: add style "font-weight: bold" to the block and
1097         strip the <b> element, leaving its children.
1098
1099       example:
1100
1101         <p>
1102           <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1103         </p>
1104
1105       becomes:
1106
1107           <p style="font-weight: bold; font-family: Arial; font-size: 6">
1108             Draft Recommended Practice
1109           </p>
1110
1111       This code also replaces the align attribute by a style attribute.
1112       However, to avoid CSS problems with Navigator 4, this isn't done
1113       for the elements: caption, tr and table
1114     */
1115     private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode)
1116     {
1117         Node child;
1118
1119         if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0)
1120         {
1121             if (node.tag != tt.tagTable
1122                     && node.tag != tt.tagTr
1123                     && node.tag != tt.tagLi)
1124             {
1125                 /* check for align attribute */
1126                 if (node.tag != tt.tagCaption)
1127                     textAlign(lexer, node);
1128
1129                 child = node.content;
1130
1131                 if (child == null)
1132                     return false;
1133
1134                 /* check child has no peers */
1135
1136                 if (child.next != null)
1137                     return false;
1138
1139                 if (child.tag == tt.tagB)
1140                 {
1141                     mergeStyles(node, child);
1142                     addStyleProperty(node, "font-weight: bold");
1143                     stripOnlyChild(node);
1144                     return true;
1145                 }
1146
1147                 if (child.tag == tt.tagI)
1148                 {
1149                     mergeStyles(node, child);
1150                     addStyleProperty(node, "font-style: italic");
1151                     stripOnlyChild(node);
1152                     return true;
1153                 }
1154
1155                 if (child.tag == tt.tagFont)
1156                 {
1157                     mergeStyles(node, child);
1158                     addFontStyles(node, child.attributes);
1159                     stripOnlyChild(node);
1160                     return true;
1161                 }
1162             }
1163         }
1164
1165         return false;
1166     }
1167
1168     /* the only child of table cell or an inline element such as em */
1169     private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode)
1170     {
1171         Node child;
1172
1173         if (node.tag != tt.tagFont && (node.tag.model & (Dict.CM_INLINE|Dict.CM_ROW)) != 0)
1174         {
1175             child = node.content;
1176
1177             if (child == null)
1178                 return false;
1179
1180             /* check child has no peers */
1181
1182             if (child.next != null)
1183                 return false;
1184
1185             if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis)
1186             {
1187                 mergeStyles(node, child);
1188                 addStyleProperty(node, "font-weight: bold");
1189                 stripOnlyChild(node);
1190                 return true;
1191             }
1192
1193             if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis)
1194             {
1195                 mergeStyles(node, child);
1196                 addStyleProperty(node, "font-style: italic");
1197                 stripOnlyChild(node);
1198                 return true;
1199             }
1200
1201             if (child.tag == tt.tagFont)
1202             {
1203                 mergeStyles(node, child);
1204                 addFontStyles(node, child.attributes);
1205                 stripOnlyChild(node);
1206                 return true;
1207             }
1208         }
1209
1210         return false;
1211     }
1212
1213     /*
1214       Replace font elements by span elements, deleting
1215       the font element's attributes and replacing them
1216       by a single style attribute.
1217     */
1218     private boolean font2Span(Lexer lexer, Node node, MutableObject pnode)
1219     {
1220         AttVal av, style, next;
1221
1222         if (node.tag == tt.tagFont)
1223         {
1224             if (lexer.configuration.DropFontTags)
1225             {
1226                 discardContainer(node, pnode);
1227                 return false;
1228             }
1229
1230             /* if FONT is only child of parent element then leave alone */
1231             if (node.parent.content == node
1232                 && node.next == null)
1233                 return false;
1234
1235             addFontStyles(node, node.attributes);
1236
1237             /* extract style attribute and free the rest */
1238             av = node.attributes;
1239             style = null;
1240
1241             while (av != null)
1242             {
1243                 next = av.next;
1244
1245                 if (av.attribute.equals("style"))
1246                 {
1247                     av.next = null;
1248                     style = av;
1249                 }
1250
1251                 av = next;
1252             }
1253
1254             node.attributes = style;
1255
1256             node.tag = tt.tagSpan;
1257             node.element = "span";
1258
1259             return true;
1260         }
1261
1262         return false;
1263     }
1264
1265     /*
1266       Applies all matching rules to a node.
1267     */
1268     private Node cleanNode(Lexer lexer, Node node)
1269     {
1270         Node next = null;
1271         MutableObject o = new MutableObject();
1272         boolean b = false;
1273
1274         for (next = node; node.isElement(); node = next)
1275         {
1276             o.setObject(next);
1277
1278             b = dir2Div(lexer, node, o);
1279             next = (Node)o.getObject();
1280             if (b)
1281                 continue;
1282
1283             b = nestedList(lexer, node, o);
1284             next = (Node)o.getObject();
1285             if (b)
1286                 continue;
1287
1288             b = center2Div(lexer, node, o);
1289             next = (Node)o.getObject();
1290             if (b)
1291                 continue;
1292
1293             b = mergeDivs(lexer, node, o);
1294             next = (Node)o.getObject();
1295             if (b)
1296                 continue;
1297
1298             b = blockStyle(lexer, node, o);
1299             next = (Node)o.getObject();
1300             if (b)
1301                 continue;
1302
1303             b = inlineStyle(lexer, node, o);
1304             next = (Node)o.getObject();
1305             if (b)
1306                 continue;
1307
1308             b = font2Span(lexer, node, o);
1309             next = (Node)o.getObject();
1310             if (b)
1311                 continue;
1312
1313             break;
1314         }
1315
1316         return next;
1317     }
1318
1319     private Node createStyleProperties(Lexer lexer, Node node)
1320     {
1321         Node child;
1322
1323         if (node.content != null)
1324         {
1325             for (child = node.content; child != null; child = child.next)
1326             {
1327                 child = createStyleProperties(lexer, child);
1328             }
1329         }
1330
1331         return cleanNode(lexer, node);
1332     }
1333
1334     private void defineStyleRules(Lexer lexer, Node node)
1335     {
1336         Node child;
1337
1338         if (node.content != null)
1339         {
1340             for (child = node.content;
1341                     child != null; child = child.next)
1342             {
1343                 defineStyleRules(lexer, child);
1344             }
1345         }
1346
1347         style2Rule(lexer, node);
1348     }
1349
1350     public void cleanTree(Lexer lexer, Node doc)
1351     {
1352         doc = createStyleProperties(lexer, doc);
1353
1354         if (!lexer.configuration.MakeClean)
1355         {
1356             defineStyleRules(lexer, doc);
1357             createStyleElement(lexer, doc);
1358         }
1359     }
1360
1361     /* simplifies <b><b> ... </b> ...</b> etc. */
1362     public void nestedEmphasis(Node node)
1363     {
1364         MutableObject o = new MutableObject();
1365         Node next;
1366
1367         while (node != null)
1368         {
1369             next = node.next;
1370
1371             if ((node.tag == tt.tagB || node.tag == tt.tagI)
1372                 && node.parent != null && node.parent.tag == node.tag)
1373             {
1374                 /* strip redundant inner element */
1375                 o.setObject(next);
1376                 discardContainer(node, o);
1377                 next = (Node)o.getObject();
1378                 node = next;
1379                 continue;
1380             }
1381
1382             if (node.content != null)
1383                 nestedEmphasis(node.content);
1384
1385             node = next;
1386         }
1387     }
1388
1389     /* replace i by em and b by strong */
1390     public void emFromI(Node node)
1391     {
1392         while (node != null)
1393         {
1394             if (node.tag == tt.tagI)
1395             {
1396                 node.element = tt.tagEm.name;
1397                 node.tag = tt.tagEm;
1398             }
1399             else if (node.tag == tt.tagB)
1400             {
1401                 node.element = tt.tagStrong.name;
1402                 node.tag = tt.tagStrong;
1403             }
1404
1405             if (node.content != null)
1406                 emFromI(node.content);
1407
1408             node = node.next;
1409         }
1410     }
1411
1412     /*
1413      Some people use dir or ul without an li
1414      to indent the content. The pattern to
1415      look for is a list with a single implicit
1416      li. This is recursively replaced by an
1417      implicit blockquote.
1418     */
1419     public void list2BQ(Node node)
1420     {
1421         while (node != null)
1422         {
1423             if (node.content != null)
1424                 list2BQ(node.content);
1425
1426             if (node.tag != null && node.tag.parser == ParserImpl.getParseList() &&
1427                 node.hasOneChild() && node.content.implicit)
1428             {
1429                 stripOnlyChild(node);
1430                 node.element = tt.tagBlockquote.name;
1431                 node.tag = tt.tagBlockquote;
1432                 node.implicit = true;
1433             }
1434
1435             node = node.next;
1436         }
1437     }
1438
1439     /*
1440      Replace implicit blockquote by div with an indent
1441      taking care to reduce nested blockquotes to a single
1442      div with the indent set to match the nesting depth
1443     */
1444     public void bQ2Div(Node node)
1445     {
1446         int indent;
1447         String indent_buf;
1448
1449         while (node != null)
1450         {
1451             if (node.tag == tt.tagBlockquote && node.implicit)
1452             {
1453                 indent = 1;
1454
1455                 while(node.hasOneChild() &&
1456                       node.content.tag == tt.tagBlockquote &&
1457                       node.implicit)
1458                 {
1459                     ++indent;
1460                     stripOnlyChild(node);
1461                 }
1462
1463                 if (node.content != null)
1464                     bQ2Div(node.content);
1465
1466                 indent_buf = "margin-left: " +
1467                              (new Integer(2*indent)).toString() + "em";
1468
1469                 node.element = tt.tagDiv.name;
1470                 node.tag = tt.tagDiv;
1471                 node.addAttribute("style", indent_buf);
1472             }
1473             else if (node.content != null)
1474                 bQ2Div(node.content);
1475
1476
1477             node = node.next;
1478         }
1479     }
1480
1481     /* node is <![if ...]> prune up to <![endif]> */
1482     public Node pruneSection(Lexer lexer, Node node)
1483     {
1484         for (;;)
1485         {
1486             /* discard node and returns next */
1487             node = Node.discardElement(node);
1488
1489             if (node == null)
1490                 return null;
1491
1492             if (node.type == Node.SectionTag)
1493             {
1494                 if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
1495                 {
1496                     node = pruneSection(lexer, node);
1497                     continue;
1498                 }
1499
1500                 if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif"))
1501                 {
1502                     node = Node.discardElement(node);
1503                     break;
1504                 }
1505             }
1506         }
1507
1508         return node;
1509     }
1510
1511     public void dropSections(Lexer lexer, Node node)
1512     {
1513         while (node != null)
1514         {
1515             if (node.type == Node.SectionTag)
1516             {
1517                 /* prune up to matching endif */
1518                 if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
1519                 {
1520                     node = pruneSection(lexer, node);
1521                     continue;
1522                 }
1523
1524                 /* discard others as well */
1525                 node = Node.discardElement(node);
1526                 continue;
1527             }
1528
1529             if (node.content != null)
1530                 dropSections(lexer, node.content);
1531
1532             node = node.next;
1533         }
1534     }
1535
1536     public void purgeAttributes(Node node)
1537     {
1538         AttVal attr = node.attributes;
1539         AttVal next = null;
1540         AttVal prev = null;
1541
1542         while (attr != null)
1543         {
1544             next = attr.next;
1545
1546             /* special check for class="Code" denoting pre text */
1547             if (attr.attribute != null &&
1548                 attr.value != null &&
1549                 attr.attribute.equals("class") &&
1550                 attr.value.equals("Code"))
1551             {
1552                 prev = attr;
1553             }
1554             else if (attr.attribute != null &&
1555                 (attr.attribute.equals("class") ||
1556                  attr.attribute.equals("style") ||
1557                  attr.attribute.equals("lang") ||
1558                  attr.attribute.startsWith("x:") ||
1559                  ((attr.attribute.equals("height") || attr.attribute.equals("width")) &&
1560                     (node.tag == tt.tagTd || node.tag == tt.tagTr || node.tag == tt.tagTh))))
1561             {
1562                 if (prev != null)
1563                     prev.next = next;
1564                 else
1565                     node.attributes = next;
1566
1567             }
1568             else
1569                 prev = attr;
1570
1571             attr = next;
1572         }
1573     }
1574
1575     /* Word2000 uses span excessively, so we strip span out */
1576     public Node stripSpan(Lexer lexer, Node span)
1577     {
1578         Node node;
1579         Node prev = null;
1580         Node content;
1581
1582         /*
1583          deal with span elements that have content
1584          by splicing the content in place of the span
1585          after having processed it
1586         */
1587
1588         cleanWord2000(lexer, span.content);
1589         content = span.content;
1590
1591         if (span.prev != null)
1592             prev = span.prev;
1593         else if (content != null)
1594         {
1595             node = content;
1596             content = content.next;
1597             Node.removeNode(node);
1598             Node.insertNodeBeforeElement(span, node);
1599             prev = node;
1600         }
1601
1602         while (content != null)
1603         {
1604             node = content;
1605             content = content.next;
1606             Node.removeNode(node);
1607             Node.insertNodeAfterElement(prev, node);
1608             prev = node;
1609         }
1610
1611         if (span.next == null)
1612             span.parent.last = prev;
1613
1614         node = span.next;
1615         span.content = null;
1616         Node.discardElement(span);
1617         return node;
1618     }
1619
1620     /* map non-breaking spaces to regular spaces */
1621     private void normalizeSpaces(Lexer lexer, Node node)
1622     {
1623         while (node != null)
1624         {
1625             if (node.content != null)
1626                 normalizeSpaces(lexer, node.content);
1627
1628             if (node.type == Node.TextNode)
1629             {
1630                 int i;
1631                 MutableInteger c = new MutableInteger();
1632                 int p = node.start;
1633
1634                 for (i = node.start; i < node.end; ++i)
1635                 {
1636                     c.value = (int)node.textarray[i];
1637
1638                     /* look for UTF-8 multibyte character */
1639                     if (c.value > 0x7F)
1640                         i += PPrint.getUTF8(node.textarray, i, c);
1641
1642                     if (c.value == 160)
1643                         c.value = ' ';
1644
1645                     p = PPrint.putUTF8(node.textarray, p, c.value);
1646                 }
1647             }
1648
1649             node = node.next;
1650         }
1651     }
1652
1653     /*
1654      This is a major clean up to strip out all the extra stuff you get
1655      when you save as web page from Word 2000. It doesn't yet know what
1656      to do with VML tags, but these will appear as errors unless you
1657      declare them as new tags, such as o:p which needs to be declared
1658      as inline.
1659     */
1660     public void cleanWord2000(Lexer lexer, Node node)
1661     {
1662         /* used to a list from a sequence of bulletted p's */
1663         Node list = null;
1664
1665         while (node != null)
1666         {
1667             /* discard Word's style verbiage */
1668             if (node.tag == tt.tagStyle ||
1669                 node.tag == tt.tagMeta ||
1670                 node.type == Node.CommentTag)
1671             {
1672                 node = Node.discardElement(node);
1673                 continue;
1674             }
1675
1676             /* strip out all span tags Word scatters so liberally! */
1677             if (node.tag == tt.tagSpan)
1678             {
1679                 node = stripSpan(lexer, node);
1680                 continue;
1681             }
1682
1683             /* get rid of Word's xmlns attributes */
1684             if (node.tag == tt.tagHtml)
1685             {
1686                 /* check that it's a Word 2000 document */
1687                 if (node.getAttrByName("xmlns:o") == null)
1688                     return;
1689             }
1690
1691             if (node.tag == tt.tagLink)
1692             {
1693                 AttVal attr = node.getAttrByName("rel");
1694
1695                 if (attr != null && attr.value != null &&
1696                     attr.value.equals("File-List"))
1697                 {
1698                     node = Node.discardElement(node);
1699                     continue;
1700                 }
1701             }
1702
1703             /* discard empty paragraphs */
1704             if (node.content == null && node.tag == tt.tagP)
1705             {
1706                 node = Node.discardElement(node);
1707                 continue;
1708             }
1709
1710             if (node.tag == tt.tagP)
1711             {
1712                 AttVal attr = node.getAttrByName("class");
1713
1714                 /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
1715                 if (attr != null && attr.value != null &&
1716                     attr.value.equals("MsoListBullet"))
1717                 {
1718                     Node.coerceNode(lexer, node, tt.tagLi);
1719
1720                     if (list == null || list.tag != tt.tagUl)
1721                     {
1722                         list = lexer.inferredTag("ul");
1723                         Node.insertNodeBeforeElement(node, list);
1724                     }
1725
1726                     purgeAttributes(node);
1727
1728                     if (node.content != null)
1729                         cleanWord2000(lexer, node.content);
1730
1731                     /* remove node and append to contents of list */
1732                     Node.removeNode(node);
1733                     Node.insertNodeAtEnd(list, node);
1734                     node = list.next;
1735                 }
1736                 /* map sequence of <p class="Code"> to <pre>...</pre> */
1737                 else if (attr != null && attr.value != null &&
1738                          attr.value.equals("Code"))
1739                 {
1740                     Node br = lexer.newLineNode();
1741                     normalizeSpaces(lexer, node);
1742
1743                     if (list == null || list.tag != tt.tagPre)
1744                     {
1745                         list = lexer.inferredTag("pre");
1746                         Node.insertNodeBeforeElement(node, list);
1747                     }
1748
1749                     /* remove node and append to contents of list */
1750                     Node.removeNode(node);
1751                     Node.insertNodeAtEnd(list, node);
1752                     stripSpan(lexer, node);
1753                     Node.insertNodeAtEnd(list, br);
1754                     node = list.next;
1755                 }
1756                 else
1757                     list = null;
1758             }
1759             else
1760                 list = null;
1761
1762             /* strip out style and class attributes */
1763             if (node.type == Node.StartTag || node.type == Node.StartEndTag)
1764                 purgeAttributes(node);
1765
1766             if (node.content != null)
1767                 cleanWord2000(lexer, node.content);
1768
1769             node = node.next;
1770         }
1771     }
1772
1773     public boolean isWord2000(Node root, TagTable tt)
1774     {
1775         Node html = root.findHTML(tt);
1776
1777         return (html != null && html.getAttrByName("xmlns:o") != null);
1778     }
1779 }