package com.quantum.sql.parser;

import java.util.Vector;

/**
 * <p>An SQL Lexer. From dictionary.com:</p>
 *
 * <blockquote>
 * <p><b>lexer</b></p>
 * <p>/lek'sr/ n. Common hacker shorthand for 'lexical
 * analyzer', the input-tokenizing stage in the parser for a language
 * (the part that breaks it into word-like pieces).</p>
 * </blockquote>
 *
 * <p>Note that this class has nothing to do with the Sci-fi channel's
 * Lexx TV series.</p>
 */
public class SQLLexx {

    private static final String ENDLINE = ";"; //$NON-NLS-1$
    private static final String DASH = "-"; //$NON-NLS-1$

    /** Run kinds understood by {@link #belongsToRun(char, int)}. */
    private static final int WHITESPACE_RUN = 0;
    private static final int IDENTIFIER_RUN = 1;
    private static final int NUMERIC_RUN = 2;

    /**
     * Parses a SQL text into tokens.
     *
     * <p>Token kinds produced:</p>
     * <ul>
     * <li>END_OF_LINE for a <code>'\n'</code> that starts a token;</li>
     * <li>WHITESPACE for a run of whitespace (a run that starts with another
     *     whitespace character may itself contain <code>'\n'</code>);</li>
     * <li>IDENTIFIER for a letter, '_' or '$' followed by letters, digits,
     *     '_' or '$' (reserved words are not distinguished here);</li>
     * <li>LITERAL for a single-quoted string, ended by the next quote, by the
     *     end of the line or by the end of the input (a doubled quote is not
     *     treated as an escape);</li>
     * <li>COMMENT for <code>--</code> up to the end of the line and for
     *     <code>/* ... *&#47;</code>;</li>
     * <li>SEPARATOR for ';'. Whenever input follows the ';', the rest of that
     *     line is emitted as a COMMENT token, even when it is empty;</li>
     * <li>NUMERIC for a digit followed by digits and '.';</li>
     * <li>SYMBOL for any other single character, including a lone '-' or '/'.</li>
     * </ul>
     *
     * <p>A RuntimeException raised while scanning is printed and the tokens
     * collected so far are returned.</p>
     *
     * @param text the SQL text to tokenize
     * @return a vector of Token objects.
     */
    public static Vector parse(String text) {
        Vector tokens = new Vector();
        StringPointer p = new StringPointer(text);
        try {
            while (!p.isDone()) {
                int offset = p.getOffset();
                char c = p.getNext();

                if (c == '\n') {
                    tokens.addElement(new Token(Token.END_OF_LINE, "\n", offset, offset + 1)); //$NON-NLS-1$
                } else if (Character.isWhitespace(c)) {
                    String value = readRun(p, c, WHITESPACE_RUN);
                    tokens.addElement(new Token(Token.WHITESPACE, value, offset, offset + value.length()));
                } else if (Character.isLetter(c) || c == '_' || c == '$') {
                    String value = readRun(p, c, IDENTIFIER_RUN);
                    tokens.addElement(new Token(Token.IDENTIFIER, value, offset, offset + value.length()));
                } else if (c == '\'') {
                    String value = readLiteral(p);
                    tokens.addElement(new Token(Token.LITERAL, value, offset, offset + value.length()));
                } else if (c == '-' && !p.isDone() && p.peek() == '-') {
                    p.getNext(); // consume the second dash
                    String value = "--" + readRestOfLine(p); //$NON-NLS-1$
                    tokens.addElement(new Token(Token.COMMENT, value, offset, offset + value.length()));
                } else if (c == '-') {
                    tokens.addElement(new Token(Token.SYMBOL, DASH, offset, offset + 1));
                } else if (c == ';') {
                    tokens.addElement(new Token(Token.SEPARATOR, ENDLINE, offset, offset + 1));
                    if (!p.isDone()) {
                        // Whatever follows the separator on the same line is a comment;
                        // its offsets start one past the ';'.
                        String rest = readRestOfLine(p);
                        int restStart = offset + 1;
                        tokens.addElement(new Token(Token.COMMENT, rest, restStart, restStart + rest.length()));
                    }
                } else if (Character.isDigit(c)) {
                    String value = readRun(p, c, NUMERIC_RUN);
                    tokens.addElement(new Token(Token.NUMERIC, value, offset, offset + value.length()));
                } else if (c == '/' && !p.isDone() && p.peek() == '*') {
                    // '/*' opens a comment that lasts until '*/' or the end of the input.
                    tokens.addElement(tokenizeComment(p, offset));
                } else {
                    // Everything else, a lone '/' included, is a one-character symbol.
                    tokens.addElement(new Token(Token.SYMBOL, String.valueOf(c), offset, offset + 1));
                }
            }
        } catch (RuntimeException e) {
            // Best effort: report the problem and hand back what was tokenized so far.
            e.printStackTrace();
        }
        return tokens;
    }

    /**
     * Tells whether a character may continue a run of the given kind.
     *
     * @param c the candidate character
     * @param runKind one of WHITESPACE_RUN, IDENTIFIER_RUN or NUMERIC_RUN
     * @return true if c belongs to the run
     */
    private static boolean belongsToRun(char c, int runKind) {
        switch (runKind) {
            case WHITESPACE_RUN:
                return Character.isWhitespace(c);
            case IDENTIFIER_RUN:
                return Character.isLetterOrDigit(c) || c == '_' || c == '$';
            case NUMERIC_RUN:
                return Character.isDigit(c) || c == '.';
            default:
                return false;
        }
    }

    /**
     * Reads the longest run of characters of one kind.
     *
     * <p>The first character that does not belong to the run is always pushed
     * back, also when it is the last character of the input, so the caller's
     * loop still gets to tokenize it. This relies on back() un-reading exactly
     * one getNext(), as the original scanning loops did.</p>
     *
     * @param p the pointer, positioned just after first
     * @param first the already consumed first character of the run
     * @param runKind one of WHITESPACE_RUN, IDENTIFIER_RUN or NUMERIC_RUN
     * @return the text of the run, starting with first
     */
    private static String readRun(StringPointer p, char first, int runKind) {
        StringBuffer value = new StringBuffer();
        value.append(first);
        while (!p.isDone()) {
            char next = p.getNext();
            if (!belongsToRun(next, runKind)) {
                p.back();
                break;
            }
            value.append(next);
        }
        return value.toString();
    }

    /**
     * Reads a single-quoted literal whose opening quote was just consumed.
     *
     * @param p the pointer, positioned just after the opening quote
     * @return the literal including the opening quote and, when present, the
     *         closing quote; an unterminated literal stops before the next
     *         newline or at the end of the input
     */
    private static String readLiteral(StringPointer p) {
        StringBuffer value = new StringBuffer();
        value.append('\'');
        while (!p.isDone()) {
            char next = p.getNext();
            if (next == '\n') {
                // Leave the newline for the caller so it becomes END_OF_LINE.
                p.back();
                break;
            }
            value.append(next);
            if (next == '\'') {
                break;
            }
        }
        return value.toString();
    }

    /**
     * Reads up to, but not including, the next newline or the end of the input.
     *
     * @param p the pointer to read from
     * @return the remaining text of the current line, possibly empty
     */
    private static String readRestOfLine(StringPointer p) {
        StringBuffer value = new StringBuffer();
        while (!p.isDone()) {
            char next = p.getNext();
            if (next == '\n') {
                p.back();
                break;
            }
            value.append(next);
        }
        return value.toString();
    }

    /**
     * Reads a block comment. The leading '/' has been consumed by the caller
     * and the next character is known to be '*'.
     *
     * <p>The closing delimiter is only looked for after the opening one, so
     * the text <code>/*&#47;</code> does not count as a complete comment. An
     * unterminated comment extends to the end of the input.</p>
     *
     * @param p the pointer, positioned on the '*' of the opening delimiter
     * @param offset the offset of the leading '/'
     * @return the COMMENT token, delimiters included
     */
    private static Token tokenizeComment(StringPointer p, int offset) {
        StringBuffer value = new StringBuffer("/*"); //$NON-NLS-1$
        p.getNext(); // consume the '*' of the opening delimiter

        boolean previousWasStar = false;
        while (!p.isDone()) {
            char c = p.getNext();
            value.append(c);
            if (previousWasStar && c == '/') {
                break;
            }
            previousWasStar = (c == '*');
        }
        return new Token(Token.COMMENT, value.toString(), offset, offset + value.length());
    }
}