X-Git-Url: http://git.phpeclipse.com diff --git a/archive/net.sourceforge.phpeclipse.quantum.sql/src/com/quantum/sql/parser/SQLLexx.java b/archive/net.sourceforge.phpeclipse.quantum.sql/src/com/quantum/sql/parser/SQLLexx.java index e901266..8c83886 100644 --- a/archive/net.sourceforge.phpeclipse.quantum.sql/src/com/quantum/sql/parser/SQLLexx.java +++ b/archive/net.sourceforge.phpeclipse.quantum.sql/src/com/quantum/sql/parser/SQLLexx.java @@ -2,10 +2,36 @@ package com.quantum.sql.parser; import java.util.Vector; +/** + *
An SQL Lexer. From + * dictionary.com: + * + *
+ *+ * + *lexer + * + *
/lek'sr/ n. Common hacker shorthand for 'lexical + * analyzer', the input-tokenizing stage in the parser for a language + * (the part that breaks it into word-like pieces). + *
Note that this class has nothing to do with the Sci-fi channel's + * Lexx TV series. + */ public class SQLLexx { - private static String endline = ";"; //$NON-NLS-1$ - private static String dash = "-"; //$NON-NLS-1$ - private static String group = "/"; //$NON-NLS-1$ + private final static char CHAR_EOL = '\n'; + private final static char CHAR_DASH = '-'; + private final static char CHAR_ESCAPE = '\\'; + private final static char CHAR_SEPARATOR = ';'; + + private final static int CONDITION_WHITESPACE = 1; + private final static int CONDITION_IDENTIFIER = 2; + private final static int CONDITION_IDENTIFIER_INITIAL = 3; + private final static int CONDITION_LITERAL_SIMPLE_QUOTE = 4; + private final static int CONDITION_LITERAL_DOUBLE_QUOTE = 5; + private final static int CONDITION_NUMERIC = 6; + private final static int CONDITION_EOL = 7; + + /** * Parses a SQL text into tokens. * @param text @@ -19,134 +45,75 @@ public class SQLLexx { int offset = p.getOffset(); char c = p.getNext(); // Adds END_OF_LINE token - if (c == '\n') { - tokens.addElement(new Token(Token.END_OF_LINE, "\n", offset, offset + 1)); + if (c == CHAR_EOL) { + tokens.addElement(new Token(Token.END_OF_LINE, CHAR_EOL, offset)); } // Adds WHITESPACE token; - else if (Character.isWhitespace(c)) { - StringBuffer value = new StringBuffer(); - while (Character.isWhitespace(c) && !p.isDone()) { - value.append(c); - c = p.getNext(); - } - // done because of is done - if (Character.isWhitespace(c)) { - value.append(c); - } else if (!p.isDone()){ - p.back(); - } + else if (CheckCondition( c, CONDITION_WHITESPACE)) + { + StringBuffer value = AddTokenWhile(p, c, CONDITION_WHITESPACE); tokens.addElement(new Token(Token.WHITESPACE, value.toString(), offset, offset + value.length())); // Adds IDENTIFIER token (can be reserved SQL word or not); - } else if (Character.isLetter(c) || c == '_' || c == '$') { - StringBuffer value = new StringBuffer(); - while ((Character.isLetterOrDigit(c) || c == '_' || c == '$') && !p.isDone()) { - value.append(c); - c = p.getNext(); - } - if ((Character.isLetterOrDigit(c) || c == '_')) { - value.append(c); - } else if (!p.isDone()){ - p.back(); - } + } else if (CheckCondition( c , CONDITION_IDENTIFIER_INITIAL)) + { + StringBuffer value = AddTokenWhile(p, c, CONDITION_IDENTIFIER); tokens.addElement(new Token(Token.IDENTIFIER, value.toString(), offset, offset + value.length())); // Adds LITERAL token; - } else if (c == '\'') { - StringBuffer value = new StringBuffer(); - value.append(c); - if (!p.isDone()) { - c = p.getNext(); - while (c != '\'' && c != '\n' && !p.isDone()) { - value.append(c); - c = p.getNext(); - } - if (c == '\'' || p.isDone()) { - value.append(c); - } else if (!p.isDone()){ - p.back(); - } - } + } else if (CheckCondition(c, CONDITION_LITERAL_SIMPLE_QUOTE)) { + StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE); tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length())); - // Adds COMMENT token (or SYMBOL (dash) if only one dash); - } else if (c == '-') { - p.mark(); + // Adds LITERAL token; + } else if (CheckCondition(c, CONDITION_LITERAL_DOUBLE_QUOTE)) { + StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE); + tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length())); + // Adds NUMERIC token; + } else if (Character.isDigit(c)) { + StringBuffer value = AddTokenWhile(p, c, CONDITION_NUMERIC); + tokens.addElement(new Token(Token.NUMERIC, value.toString(), offset, offset + value.length())); + // Adds COMMENT token if two dashes (or SYMBOL (dash) if only one dash); + } else if (c == CHAR_DASH) { if (p.isDone()) { - tokens.addElement(new Token(Token.SYMBOL, dash, offset, offset + 1)); + tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1)); } else { - char next = p.getNext(); - if (next == '-') { - StringBuffer value = new StringBuffer("--"); //$NON-NLS-1$ - if (!p.isDone()) { - c = p.getNext(); - while (c != '\n' && !p.isDone()) { - value.append(c); - c = p.getNext(); - } - if (p.isDone()) { - value.append(c); - } else { - p.back(); - } - } + char next = p.peek(); + if (next == CHAR_DASH) { + StringBuffer value = AddTokenUntil(p, CHAR_DASH, CONDITION_EOL); tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length())); } else { - tokens.addElement(new Token(Token.SYMBOL, dash, offset, offset + 1)); - p.reset(); + tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1)); } + } + // Determine if the ';' is escaped or not + } else if (c == CHAR_ESCAPE) { + if (p.peek() == CHAR_SEPARATOR) { + p.getNext(); // We advance the pointer so the separator is not marked again + // We DON´T SAVE the scape character in the tokens. + // For correct sintax highlighting we set the offset to +2 + // This is so far the only case when a character is eliminated and not saved to the tokens. + // That means it won´t be sent to the database when executed. + // This is to allow definitions of procedures with ';' as an end-of-sentence, + // not as an execution symbol for SQL. + tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_SEPARATOR).toString() , offset, offset + 2)); + } else { + tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_ESCAPE).toString() , offset, offset + 1)); } // Adds SEPARATOR token (;), considers the rest of the line as COMMENT token; - } else if (c == ';') { - tokens.addElement(new Token(Token.SEPARATOR, endline, offset, offset + 1)); - StringBuffer value = new StringBuffer(); + } else if (c == CHAR_SEPARATOR) { + tokens.addElement(new Token(Token.SEPARATOR, new Character(CHAR_SEPARATOR).toString(), offset, offset + 1)); + // The rest of the line will be a comment if (!p.isDone()) { - c = p.getNext(); - while (c != '\n' && !p.isDone()) { - value.append(c); - c = p.getNext(); - } - if (p.isDone()) { - value.append(c); - } else { - p.back(); - } - // We add to the offset so as to skip the initial ';' + StringBuffer value = AddTokenUntil(p, "", CONDITION_EOL); + // We add to the offset so as to skip the initial ';' offset++; tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length())); } - // Adds NUMERIC token; - } else if (Character.isDigit(c)) { - StringBuffer value = new StringBuffer(); - while ((Character.isDigit(c) || c == '.') && !p.isDone()) { - value.append(c); - c = p.getNext(); - } - if ((Character.isDigit(c) || c == '.')) { - value.append(c); - } else { - p.back(); - } - tokens.addElement(new Token(Token.NUMERIC, value.toString(), offset, offset + value.length())); - // Adds COMMENT token (or GROUP (slash) if only one slash); + // Adds COMMENT token, for several lines; } else if (c == '/') { - p.mark(); // If we have '/*', it's a comment till '*/' found or eof if (p.peek() == '*') { - StringBuffer value = new StringBuffer(); - c = p.getNext(); - value.append('/'); - while (!( c == '*' && p.peek() == '/' ) && !p.isDone()) { - value.append(c); - c = p.getNext(); - } - if (!p.isDone()){ - value.append(c); - c = p.getNext(); - value.append(c); - } - tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length())); + tokens.addElement(tokenizeComment(p, offset)); } else { - // It's not '/*' , so it's a group token - tokens.addElement(new Token(Token.GROUP, group, offset, offset + 1)); - p.reset(); + tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}) , offset, offset + 1)); } // Adds SYMBOL token; } else { @@ -163,4 +130,103 @@ public class SQLLexx { // } return tokens; } + /** + * Searchs for a token end, UNTIL the condition is true, or a newline, or the end of the StringPointer + * The end character is also addedd to the StringBuffer + * @param p + * @param s A string with the first character from the token, already extracted from the StringPointer + * @param condition + * @return a StringBuffer with the complete token + */ + private static StringBuffer AddTokenUntil(StringPointer p, String s, int condition) { + StringBuffer value = new StringBuffer(s); + if (p.isDone()) return value; + for(;;) { + char c = p.getNext(); + if (c != CHAR_EOL) value.append(c); + if (CheckCondition (c, condition) || c == CHAR_EOL || p.isDone()) { + break; + } + } + return value; + } + private static StringBuffer AddTokenUntil(StringPointer p, char c, int condition) { + return AddTokenUntil(p, new Character(c).toString(), condition); + } + /** + * Searchs for a token end, WHILE the condition is true, or the end or the StringPointer. + * @param p The StringPointer where the original stream is + * @param s A string with the first character from the token, already extracted from the StringPointer + * @param condition The condition to end the token + * @return a StringBuffer with the complete token + */ + private static StringBuffer AddTokenWhile(StringPointer p, String s, int condition) { + StringBuffer value = new StringBuffer(s); + if (p.isDone()) return value; + for(;;) { + char c = p.getNext(); + if (CheckCondition (c, condition)) { + value.append(c); + if (p.isDone()) break; + } + else + { + p.back(); + break; + } + } + return value; + } + private static StringBuffer AddTokenWhile(StringPointer p, char c, int condition) { + return AddTokenWhile(p, new Character(c).toString(), condition); + } + /** + * Returns true if the character meets the condition, and false if not. + * New conditions should be defined in this function + * @param c The character to check the condition + * @param condition The condition to check + * @return + */ + private static boolean CheckCondition(char c, int condition) { + switch (condition) { + case CONDITION_WHITESPACE: + return Character.isWhitespace(c); + case CONDITION_IDENTIFIER_INITIAL: + return (Character.isLetter(c) || c == '$' || c == '#'); + case CONDITION_IDENTIFIER: + return (Character.isLetter(c) || Character.isDigit(c) || c == '_' || c == '$' || c == '#'); + case CONDITION_LITERAL_SIMPLE_QUOTE: + return (c == '\''); + case CONDITION_LITERAL_DOUBLE_QUOTE: + return (c == '\"'); + case CONDITION_NUMERIC: + return (Character.isDigit(c) || c == '.'); + case CONDITION_EOL: + return (c == CHAR_EOL); + default: + break; + } + return false; + } + /** + * @param tokens + * @param p + * @param offset + */ + private static Token tokenizeComment(StringPointer p, int offset) { + char c; + StringBuffer value = new StringBuffer(); + c = p.getNext(); + value.append('/'); + while (!( c == '*' && p.peek() == '/' ) && !p.isDone()) { + value.append(c); + c = p.getNext(); + } + if (!p.isDone()){ + value.append(c); + c = p.getNext(); + value.append(c); + } + return new Token(Token.COMMENT, value.toString(), offset, offset + value.length()); + } }