package org.plog4u.wiki.filter; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; import java.util.NoSuchElementException; import java.util.Stack; import java.util.StringTokenizer; //import org.apache.commons.logging.Log; //import org.apache.commons.logging.LogFactory; import org.plog4u.wiki.filter.WikipediaFilter.InvalidInputException; import org.plog4u.wiki.filter.tags.AbstractTag; import org.plog4u.wiki.filter.tags.CloseTagToken; import org.plog4u.wiki.filter.tags.ListToken; import org.plog4u.wiki.filter.tags.OpenTagToken; import org.plog4u.wiki.filter.tags.SpecialTagToken; import org.radeox.api.engine.ImageRenderEngine; import org.radeox.api.engine.IncludeRenderEngine; import org.radeox.api.engine.RenderEngine; import org.radeox.api.engine.WikiRenderEngine; import org.radeox.filter.context.FilterContext; import org.radeox.filter.interwiki.InterWiki; import org.radeox.macro.Macro; import org.radeox.macro.MacroRepository; import org.radeox.macro.parameter.MacroParameter; import org.radeox.util.Encoder; import org.radeox.util.StringBufferWriter; /** * A parser for the WikipediaFilter * * @see org.plog4u.wiki.filter.WikipediaFilter */ public class WikipediaParser { // private static Log log = LogFactory.getLog(WikipediaFilter.class); MacroRepository fMacros; private FilterContext fContext; private RenderEngine fWikiEngine; // TODO check, if this counter is correct in recursions: private int fImageCounter; /** * The current snip */ // private Snip fSnip; /** * If the snip contains headings for a "table of content" this buffer temporarily contains the start of the snip and the * "table of content" */ private StringBuffer fResultBufferHeader = null; /** * The buffer for the resulting HTML rendering from the current snip. */ private StringBuffer fResultBuffer; /** * The wiki syntax string which should be parsed */ private char[] fSource; /** * The corresponding String for the character source array */ private final String fStringSource; /** * The current scanned character */ private char fCurrentCharacter; /** * The current offset in the character source array */ private int fCurrentPosition; /** * The current recursion level for this parser */ private int fRecursionLevel; private Stack fTokenStack; // private Stack fTableStack; private boolean fWhiteStart = false; private int fWhiteStartPosition = 0; // private TeXParser fTeXParser; // private TeXParser fTeXImageParser; /** * * "table of content" * */ private ArrayList fTableOfContent = null; // private String fSrcPath; // private String fBinPath; public WikipediaParser(MacroRepository macros, String stringSource, StringBuffer result, FilterContext context, int recursionLevel) { fContext = context; fWikiEngine = context.getRenderContext().getRenderEngine(); // try { // SnipMacroParameter params = (SnipMacroParameter) // fContext.getMacroParameter(); // fSnip = params.getSnipRenderContext().getSnip(); // } catch (ClassCastException e) { // e.printStackTrace(); // } fMacros = macros; fResultBuffer = result; fStringSource = stringSource; setSource(stringSource.toCharArray()); fRecursionLevel = recursionLevel; fTokenStack = new Stack(); // fTableStack = new Stack(); // fTeXParser = new TeXParser("", "m:"); // fTeXImageParser = new TeXParser("", ""); fImageCounter = 1; // fSrcPath = (String) fContext.getRenderContext().get("srcpath"); // if (fSrcPath==null) { // fSrcPath = ""; // } // fBinPath = (String) fContext.getRenderContext().get("binpath"); // if (fBinPath==null) { // fBinPath = ""; // } } /** * Check until a new-line was found, if there are only whitespace characters before the given endposition. * * @param startPosition * @param endPosition * @return -1 if no whitespace line is found from the end (i.e. endPosition); otherwise the offset directly after where the * new-line was found */ private int checkWhitespaces(int startPosition, int endPosition) { char tempChar; while (endPosition >= startPosition) { if ((tempChar = fSource[endPosition--]) == '\n') { return endPosition + 2; } if (tempChar != ' ' && tempChar != '\t' && tempChar != '\r') { return -1; } } if (endPosition < startPosition && endPosition >= 0) { if ((tempChar = fSource[endPosition]) != '\n') { return -1; } } else if (endPosition == (-1) && startPosition == 0) { // special case at the start of a string return 0; } return startPosition; } /** * copy the content in the resulting buffer and escape special html characters (< > " & ') */ private void copyWhite(boolean whiteStart, final int whiteStartPosition, final int diff) { if (whiteStart) { final int len = fCurrentPosition - diff; int currentIndex = whiteStartPosition; int lastIndex = currentIndex; while (currentIndex < len) { switch (fSource[currentIndex++]) { case '<': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(fSource, lastIndex, currentIndex - lastIndex - 1); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append("<"); break; case '>': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(fSource, lastIndex, currentIndex - lastIndex - 1); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append(">"); break; case '&': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(fSource, lastIndex, currentIndex - lastIndex - 1); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append("&"); break; case '\'': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(fSource, lastIndex, currentIndex - lastIndex - 1); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append("'"); break; case '\"': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(fSource, lastIndex, currentIndex - lastIndex - 1); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append("""); break; } } if (lastIndex < (currentIndex)) { fResultBuffer.append(fSource, lastIndex, currentIndex - lastIndex); } fWhiteStart = false; } } /** * copy the text in the resulting buffer and escape special html characters (< > " & ') */ private void copyWhite(String text) { final int len = text.length(); int currentIndex = 0; int lastIndex = currentIndex; while (currentIndex < len) { switch (text.charAt(currentIndex++)) { case '<': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(text.substring(lastIndex, currentIndex - 1)); lastIndex = currentIndex; } fResultBuffer.append("<"); break; case '>': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(text.substring(lastIndex, currentIndex - 1)); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append(">"); break; case '&': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(text.substring(lastIndex, currentIndex - 1)); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append("&"); break; case '\'': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(text.substring(lastIndex, currentIndex - 1)); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append("'"); break; case '\"': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(text.substring(lastIndex, currentIndex - 1)); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append("""); break; } } if (lastIndex < (currentIndex)) { fResultBuffer.append(text.substring(lastIndex, currentIndex)); } } /** * Copy the text in the resulting buffer and escape special html characters (< > " & ') Additionally every * newline will be replaced by <br/> */ private void copyNowikiNewLine(String text) { final int len = text.length(); int currentIndex = 0; int lastIndex = currentIndex; while (currentIndex < len) { switch (text.charAt(currentIndex++)) { case '\n': if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(text.substring(lastIndex, currentIndex - 1)); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append("
"); break; case '<': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(text.substring(lastIndex, currentIndex - 1)); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append("<"); break; case '>': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(text.substring(lastIndex, currentIndex - 1)); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append(">"); break; // case '&': // special html escape character // if (lastIndex < (currentIndex - 1)) { // fResultBuffer.append(text.substring(lastIndex, currentIndex - 1)); // lastIndex = currentIndex; // } else { // lastIndex++; // } // fResultBuffer.append("&"); // break; case '\'': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(text.substring(lastIndex, currentIndex - 1)); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append("'"); break; case '\"': // special html escape character if (lastIndex < (currentIndex - 1)) { fResultBuffer.append(text.substring(lastIndex, currentIndex - 1)); lastIndex = currentIndex; } else { lastIndex++; } fResultBuffer.append("""); break; } } if (lastIndex < (currentIndex)) { fResultBuffer.append(text.substring(lastIndex, currentIndex)); } } /** * Render the HTML token which are defined in the OPEN_TAGS and CLOSE_TAGS map * * @return */ public int getHTMLToken() { int currentHtmlPosition = fCurrentPosition; try { char closeCharacter; char nextCharacter; if (getNextChar('/')) { // end tag detected currentHtmlPosition++; // closing tag int r = readUntilCharOrEOL('>'); if (r != 1) { return WikipediaFilter.TokenNotFound; } String closeTagString = new String(fSource, currentHtmlPosition, fCurrentPosition - currentHtmlPosition - 1).toLowerCase(); // System.out.println(closeTagString); StringTokenizer tagTokenizer = new StringTokenizer(closeTagString); String tokenString; try { tokenString = tagTokenizer.nextToken(); CloseTagToken token = (CloseTagToken) WikipediaFilter.CLOSE_TAGS.get(tokenString); if (token == null) { return WikipediaFilter.TokenNotFound; } Object topToken = fTokenStack.peek(); if (topToken instanceof OpenTagToken && ((OpenTagToken) topToken).getTagName() == token.getTagName()) { fTokenStack.pop(); // if (token.getTagName().equals("table")) { // fTableStack.pop(); // } copyWhite(fWhiteStart, fWhiteStartPosition, 3 + tokenString.length()); fWhiteStart = false; fResultBuffer.append(token.getCloseTag()); return WikipediaFilter.TokenIgnore; } fWhiteStart = false; unexpectedTag(token.getTagName()); return WikipediaFilter.TokenIgnore; } catch (NoSuchElementException e) { return WikipediaFilter.TokenNotFound; } } else { // start tag String tokenString; int tagNameStart = fCurrentPosition; int tokenLength = 0; while (Character.isJavaIdentifierStart(fSource[fCurrentPosition])) { fCurrentPosition++; tokenLength++; } try { tokenString = new String(fSource, tagNameStart, fCurrentPosition - tagNameStart); //tagTokenizer.nextToken(); OpenTagToken token = (OpenTagToken) WikipediaFilter.OPEN_TAGS.get(tokenString); if (token == null) { return WikipediaFilter.TokenNotFound; } copyWhite(fWhiteStart, fWhiteStartPosition, (fCurrentPosition - tagNameStart) + 1); fWhiteStart = false; if (token instanceof SpecialTagToken) { // for




while (Character.isWhitespace(fSource[fCurrentPosition])) { fCurrentPosition++; } if (fSource[fCurrentPosition] == '/') { fCurrentPosition++; } if (fSource[fCurrentPosition] == '>') { fCurrentPosition++; fWhiteStartPosition = fCurrentPosition; // insert the special tag : fResultBuffer.append(token.getOpenTag()); return WikipediaFilter.TokenIgnore; } } else if (token instanceof OpenTagToken) { fResultBuffer.append("<"); fResultBuffer.append(token.getTagName()); fTokenStack.push(token); fCurrentPosition = token.scan(fResultBuffer, fSource, fCurrentPosition); fResultBuffer.append(">"); return WikipediaFilter.TokenIgnore; } return WikipediaFilter.TokenNotFound; } catch (NoSuchElementException e) { return WikipediaFilter.TokenNotFound; } } } catch (IndexOutOfBoundsException e) { // } fCurrentPosition = currentHtmlPosition; return WikipediaFilter.TokenNotFound; } public final boolean getNextChar(char testedChar) { int temp = fCurrentPosition; try { fCurrentCharacter = fSource[fCurrentPosition++]; if (fCurrentCharacter != testedChar) { fCurrentPosition = temp; return false; } return true; } catch (IndexOutOfBoundsException e) { fCurrentPosition = temp; return false; } } public final int getNextChar(char testedChar1, char testedChar2) { int temp = fCurrentPosition; try { int result; fCurrentCharacter = fSource[fCurrentPosition++]; if (fCurrentCharacter == testedChar1) result = 0; else if (fCurrentCharacter == testedChar2) result = 1; else { fCurrentPosition = temp; return -1; } return result; } catch (IndexOutOfBoundsException e) { fCurrentPosition = temp; return -1; } } public final boolean getNextCharAsDigit() { int temp = fCurrentPosition; try { fCurrentCharacter = fSource[fCurrentPosition++]; if (!Character.isDigit(fCurrentCharacter)) { fCurrentPosition = temp; return false; } return true; } catch (IndexOutOfBoundsException e) { fCurrentPosition = temp; return false; } } public final boolean getNextCharAsDigit(int radix) { int temp = fCurrentPosition; try { fCurrentCharacter = fSource[fCurrentPosition++]; if (Character.digit(fCurrentCharacter, radix) == -1) { fCurrentPosition = temp; return false; } return true; } catch (IndexOutOfBoundsException e) { fCurrentPosition = temp; return false; } } public final int getNumberOfChar(char testedChar) { int number = 0; try { while ((fCurrentCharacter = fSource[fCurrentPosition++]) == testedChar) { number++; } } catch (IndexOutOfBoundsException e) { } fCurrentPosition--; return number; } public final char[] getListChars() { int startPosition = fCurrentPosition - 1; try { while (true) { fCurrentCharacter = fSource[fCurrentPosition++]; if (fCurrentCharacter != '*' && fCurrentCharacter != '#') { break; } } } catch (IndexOutOfBoundsException e) { // } fCurrentPosition--; char[] result = new char[fCurrentPosition - startPosition]; System.arraycopy(fSource, startPosition, result, 0, fCurrentPosition - startPosition); return result; } public boolean getNextCharAsWikiPluginIdentifierPart() { int temp = fCurrentPosition; try { fCurrentCharacter = fSource[fCurrentPosition++]; if (!WikipediaFilter.isWikiPluginIdentifierPart(fCurrentCharacter)) { fCurrentPosition = temp; return false; } return true; } catch (IndexOutOfBoundsException e) { fCurrentPosition = temp; return false; } } private void stopList() { while (!fTokenStack.isEmpty()) { AbstractTag tok = (AbstractTag) fTokenStack.peek(); if (tok.equals(WikipediaFilter.LIST_UL_START)) { fTokenStack.pop(); fResultBuffer.append(""); } else if (tok.equals(WikipediaFilter.LIST_OL_START)) { fTokenStack.pop(); fResultBuffer.append(""); } else if (tok == WikipediaFilter.BOLD) { fTokenStack.pop(); fResultBuffer.append(""); } else if (tok == WikipediaFilter.ITALIC) { fTokenStack.pop(); fResultBuffer.append(""); } else if (tok == WikipediaFilter.STRONG) { fTokenStack.pop(); fResultBuffer.append(""); } else if (tok == WikipediaFilter.EM) { fTokenStack.pop(); fResultBuffer.append(""); } else if (tok == WikipediaFilter.STRIKETHROUGH) { fTokenStack.pop(); fResultBuffer.append(""); } else { break; } } } protected int getNextToken() throws InvalidInputException { boolean startOfIndent = false; fWhiteStartPosition = 0; fWhiteStart = false; try { while (true) { // fStartPosition = fCurrentPosition; fCurrentCharacter = fSource[fCurrentPosition++]; // ---------Identify the next token------------- switch (fCurrentCharacter) { case '\n': if (fWhiteStart) { int tempPosition = checkWhitespaces(fWhiteStartPosition, fCurrentPosition - 2); if (tempPosition >= 0) { copyWhite(fWhiteStart, fWhiteStartPosition, fCurrentPosition - (++tempPosition)); fWhiteStart = false; stopList(); fResultBuffer.append("

"); // continue; } } int fStartPrePosition = fCurrentPosition; boolean preSection = false; try { while (fSource[fCurrentPosition++] == ' ') { fCurrentCharacter = fSource[fCurrentPosition++]; while (fCurrentCharacter != '\n') { if (!Character.isWhitespace(fCurrentCharacter)) { // preformatted section starts here preSection = true; } fCurrentCharacter = fSource[fCurrentPosition++]; } } --fCurrentPosition; } catch (IndexOutOfBoundsException e) { } if (preSection && fRecursionLevel == 1) { String preString; copyWhite(fWhiteStart, fStartPrePosition, fCurrentPosition - fStartPrePosition); fWhiteStart = true; fResultBuffer.append("

");
            //            copyWhite(fWhiteStart, fStartPrePosition, 1);
            preString = new String(fSource, fStartPrePosition, fCurrentPosition - fStartPrePosition - 1) + '\n';
            fResultBuffer.append(WikipediaFilter.filterParser(preString, fContext, fMacros, fRecursionLevel));
            //            preString = new String(fSource, fStartPrePosition, fCurrentPosition - fStartPrePosition - 1)+'\n';
            //            int preIndex = 0;
            //            int lastIndex = 0;
            //            while (preIndex>=0) {
            //              preIndex = preString.indexOf('\n', lastIndex);
            //              if (preIndex>=0) {
            //                fResultBuffer.append(WikipediaFilter.filterParser(preString.substring(lastIndex,preIndex), fContext,
            // fCachedPage, fMacros, fRecursionLevel));
            //                fResultBuffer.append('\n');
            //                lastIndex = ++preIndex;
            //              }
            //            }
            fResultBuffer.append("
"); fWhiteStart = false; continue; } else { fCurrentPosition = fStartPrePosition; } break; case ':': if (isStartOfLine()) { copyWhite(fWhiteStart, fWhiteStartPosition, 1); fWhiteStart = false; int levelHeader = getNumberOfChar(':') + 1; int startHeadPosition = fCurrentPosition; if (readUntilEOL()) { String head = new String(fSource, startHeadPosition, fCurrentPosition - startHeadPosition); for (int i = 0; i < levelHeader; i++) { fResultBuffer.append("
"); } fResultBuffer.append(head); for (int i = 0; i < levelHeader; i++) { fResultBuffer.append("
"); } continue; } continue; } break; case ';': if (isStartOfLine() && getNextChar(' ')) { copyWhite(fWhiteStart, fWhiteStartPosition, 1); fWhiteStart = false; int startHeadPosition = fCurrentPosition; if (readUntilEOL()) { // TODO not correct - improve this String head = new String(fSource, startHeadPosition, fCurrentPosition - startHeadPosition); int index = head.indexOf(": "); if (index > 0) { fResultBuffer.append("
"); fResultBuffer.append(head.substring(0, index)); fResultBuffer.append("
"); fResultBuffer.append(head.substring(index + 2)); fResultBuffer.append("
"); } else { fResultBuffer.append("
"); fResultBuffer.append(head); fResultBuffer.append("
"); } continue; } continue; } break; case '[': int startLinkPosition = fCurrentPosition; if (getNextChar('[')) { // wikipedia link style startLinkPosition = fCurrentPosition; copyWhite(fWhiteStart, fWhiteStartPosition, 2); fWhiteStart = false; if (readUntilString("]]")) { String name = new String(fSource, startLinkPosition, fCurrentPosition - startLinkPosition - 2); // test for suffix string int temp = fCurrentPosition; StringBuffer suffixBuffer = new StringBuffer(); try { while (true) { fCurrentCharacter = fSource[fCurrentPosition++]; if (!Character.isLetterOrDigit(fCurrentCharacter)) { fCurrentPosition--; break; } suffixBuffer.append(fCurrentCharacter); } handleWikipediaLink(name, suffixBuffer.toString()); continue; } catch (IndexOutOfBoundsException e) { fCurrentPosition = temp; } handleWikipediaLink(name, ""); continue; } } else { copyWhite(fWhiteStart, fWhiteStartPosition, 1); fWhiteStart = false; if (readUntilChar(']')) { String name = new String(fSource, startLinkPosition, fCurrentPosition - startLinkPosition - 1); handleSnipLink(name); continue; } } break; case '*': //
  1. "); } else if (listToken.getToken() == WikipediaFilter.TokenLIST_OL_START && listChars[topLevel] == '*') { fTokenStack.pop(); fTokenStack.push(new ListToken(WikipediaFilter.TokenLIST_UL_START, topLevel)); fResultBuffer.append("