archive/net.sourceforge.phpeclipse.quantum.sql/src/com/quantum/sql/parser/SQLLexx.java

   1 package com.quantum.sql.parser;
   2
   3 import java.util.Vector;
   4
   5 /**
   6  * <p>An SQL Lexer.  From
   7  * <a href="http://www.dictionary.com/">dictionary.com</a>:
   8  *
   9  * <blockquote>
  10  * <p><b>lexer</b>
  11  *
  12  * <p>/lek'sr/ n. Common hacker shorthand for 'lexical
  13  * analyzer', the input-tokenizing stage in the parser for a language
  14  * (the part that breaks it into word-like pieces).
  15  * </blockquote>
  16  *
  17  * <p>Note that this class has nothing to do with the Sci-fi channel's
  18  * <a href="http://www.scifi.com/lexx/">Lexx</a> TV series.
  19  */
  20 public class SQLLexx {
  21         private final static char CHAR_EOL = '\n';
  22         private final static char CHAR_DASH = '-';
  23         private final static char CHAR_ESCAPE = '\\';
  24         private final static char CHAR_SEPARATOR = ';';
  25
  26         private final static int CONDITION_WHITESPACE = 1;
  27         private final static int CONDITION_IDENTIFIER = 2;
  28         private final static int CONDITION_IDENTIFIER_INITIAL = 3;
  29         private final static int CONDITION_LITERAL_SIMPLE_QUOTE = 4;
  30         private final static int CONDITION_LITERAL_DOUBLE_QUOTE = 5;
  31         private final static int CONDITION_NUMERIC = 6;
  32         private final static int CONDITION_EOL = 7;
  33
  34
  35         /**
  36          * Parses a SQL text into tokens.
  37          * @param text
  38          * @return a vector of Token objects.
  39          */
  40         public static Vector parse(String text) {
  41                 Vector tokens = new Vector();
  42                 StringPointer p = new StringPointer(text);
  43                 try {
  44                         while (!p.isDone()) {
  45                                 int offset = p.getOffset();
  46                                 char c = p.getNext();
  47                                 // Adds END_OF_LINE token
  48                                 if (c == CHAR_EOL) {
  49                                         tokens.addElement(new Token(Token.END_OF_LINE, CHAR_EOL, offset));
  50                                 }
  51                                 // Adds WHITESPACE token;
  52                                 else if (CheckCondition( c, CONDITION_WHITESPACE))
  53                                 {
  54                                         StringBuffer value = AddTokenWhile(p, c, CONDITION_WHITESPACE);
  55                                         tokens.addElement(new Token(Token.WHITESPACE, value.toString(), offset, offset + value.length()));
  56                                 // Adds IDENTIFIER token (can be reserved SQL word or not);
  57                                 } else if (CheckCondition( c , CONDITION_IDENTIFIER_INITIAL))
  58                                 {
  59                                         StringBuffer value = AddTokenWhile(p, c, CONDITION_IDENTIFIER);
  60                                         tokens.addElement(new Token(Token.IDENTIFIER, value.toString(), offset, offset + value.length()));
  61                                 // Adds LITERAL token;
  62                                 } else if (CheckCondition(c, CONDITION_LITERAL_SIMPLE_QUOTE)) {
  63                                         StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE);
  64                                         tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length()));
  65                                 // Adds LITERAL token;
  66                                 } else if (CheckCondition(c, CONDITION_LITERAL_DOUBLE_QUOTE)) {
  67                                         StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE);
  68                                         tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length()));
  69                                 // Adds NUMERIC token;
  70                                 } else if (Character.isDigit(c)) {
  71                                         StringBuffer value = AddTokenWhile(p, c, CONDITION_NUMERIC);
  72                                         tokens.addElement(new Token(Token.NUMERIC, value.toString(), offset, offset + value.length()));
  73                                 // Adds COMMENT token if two dashes (or SYMBOL (dash) if only one dash);
  74                                 } else if (c == CHAR_DASH) {
  75                                         if (p.isDone()) {
  76                                                 tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1));
  77                                         } else {
  78                                                 char next = p.peek();
  79                                                 if (next == CHAR_DASH) {
  80                                                         StringBuffer value = AddTokenUntil(p, CHAR_DASH, CONDITION_EOL);
  81                                                         tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length()));
  82                                                 } else {
  83                                                         tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1));
  84                                                 }
  85                                         }
  86                  //     Determine if the ';' is escaped or not
  87                                 } else if (c == CHAR_ESCAPE) {
  88                                         if (p.peek() == CHAR_SEPARATOR) {
  89                                                 p.getNext();    // We advance the pointer so the separator is not marked again
  90                                                 // We DON´T SAVE the scape character in the tokens.
  91                                                 // For correct sintax highlighting we set the offset to +2
  92                                                 // This is so far the only case when a character is eliminated and not saved to the tokens.
  93                                                 // That means it won´t be sent to the database when executed.
  94                                                 // This is to allow definitions of procedures with ';' as an end-of-sentence,
  95                                                 //  not as an execution symbol for SQL.
  96                                                 tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_SEPARATOR).toString() , offset, offset + 2));
  97                                         }       else {
  98                                                 tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_ESCAPE).toString() , offset, offset + 1));
  99                                         }
 100                                 // Adds SEPARATOR token (;),  considers the rest of the line as COMMENT token;
 101                                 } else if (c == CHAR_SEPARATOR) {
 102                                         tokens.addElement(new Token(Token.SEPARATOR, new Character(CHAR_SEPARATOR).toString(), offset, offset + 1));
 103                                         // The rest of the line will be a comment
 104                                         if (!p.isDone()) {
 105                                                 StringBuffer value = AddTokenUntil(p, "", CONDITION_EOL);
 106                                                 //      We add to the offset so as to skip the initial ';'
 107                                                 offset++;
 108                                                 tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length()));
 109                                         }
 110                                 // Adds COMMENT token, for several lines;
 111                                 } else if (c == '/') {
 112                                         // If we have '/*', it's a comment till '*/' found or eof
 113                                         if (p.peek() == '*') {
 114                                                 tokens.addElement(tokenizeComment(p, offset));
 115                                         } else {
 116                                                 tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}) , offset, offset + 1));
 117                                         }
 118                                 // Adds SYMBOL token;
 119                                 } else {
 120                                         tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}), offset, offset + 1));
 121                                 }
 122                         }
 123                 } catch (RuntimeException e) {
 124                         e.printStackTrace();
 125                 }
 126
 127 //              System.out.println("-------------------");
 128 //              for (int i = 0; i < tokens.size(); i++) {
 129 //                      System.out.println((Token) tokens.elementAt(i));
 130 //              }
 131                 return tokens;
 132         }
 133         /**
 134          * Searchs for a token end, UNTIL the condition is true, or a newline, or the end of the StringPointer
 135          * The end character is also addedd to the StringBuffer
 136          * @param p
 137          * @param s A string with the first character from the token, already extracted from the StringPointer
 138          * @param condition
 139          * @return a StringBuffer with the complete token
 140          */
 141         private static StringBuffer AddTokenUntil(StringPointer p, String s, int condition) {
 142                 StringBuffer value = new StringBuffer(s);
 143                 if (p.isDone()) return value;
 144                 for(;;) {
 145                         char c = p.getNext();
 146                         if (c != CHAR_EOL) value.append(c);
 147                         if (CheckCondition (c, condition) || c == CHAR_EOL || p.isDone()) {
 148                                 break;
 149                         }
 150                 }
 151                 return value;
 152         }
 153         private static StringBuffer AddTokenUntil(StringPointer p, char c, int condition) {
 154                 return AddTokenUntil(p, new Character(c).toString(), condition);
 155         }
 156         /**
 157          * Searchs for a token end, WHILE the condition is true, or the end or the StringPointer.
 158          * @param p             The StringPointer where the original stream is
 159          * @param s             A string with the first character from the token, already extracted from the StringPointer
 160          * @param condition     The condition to end the token
 161          * @return a StringBuffer with the complete token
 162          */
 163         private static StringBuffer AddTokenWhile(StringPointer p, String s, int condition) {
 164                 StringBuffer value = new StringBuffer(s);
 165                 if (p.isDone()) return value;
 166                 for(;;) {
 167                         char c = p.getNext();
 168                         if (CheckCondition (c, condition)) {
 169                                 value.append(c);
 170                                 if (p.isDone()) break;
 171                         }
 172                         else
 173                         {
 174                                 p.back();
 175                                 break;
 176                         }
 177                 }
 178                 return value;
 179         }
 180         private static StringBuffer AddTokenWhile(StringPointer p, char c, int condition) {
 181                 return AddTokenWhile(p, new Character(c).toString(), condition);
 182         }
 183         /**
 184          * Returns true if the character meets the condition, and false if not.
 185          * New conditions should be defined in this function
 186          * @param c     The character to check the condition
 187          * @param condition The condition to check
 188          * @return
 189          */
 190         private static boolean CheckCondition(char c, int condition) {
 191                 switch (condition) {
 192                 case CONDITION_WHITESPACE:
 193                         return Character.isWhitespace(c);
 194                 case CONDITION_IDENTIFIER_INITIAL:
 195                         return (Character.isLetter(c) || c == '$' || c == '#');
 196                 case CONDITION_IDENTIFIER:
 197                         return (Character.isLetter(c) || Character.isDigit(c) || c == '_' || c == '$' || c == '#');
 198                 case CONDITION_LITERAL_SIMPLE_QUOTE:
 199                         return (c == '\'');
 200                 case CONDITION_LITERAL_DOUBLE_QUOTE:
 201                         return (c == '\"');
 202                 case CONDITION_NUMERIC:
 203                         return (Character.isDigit(c) || c == '.');
 204                 case CONDITION_EOL:
 205                         return (c == CHAR_EOL);
 206                 default:
 207                         break;
 208                 }
 209                 return false;
 210         }
 211         /**
 212          * @param tokens
 213          * @param p
 214          * @param offset
 215          */
 216         private static Token tokenizeComment(StringPointer p, int offset) {
 217                 char c;
 218                 StringBuffer value = new StringBuffer();
 219                 c = p.getNext();
 220                 value.append('/');
 221                 while (!( c == '*' && p.peek() == '/' ) && !p.isDone()) {
 222                         value.append(c);
 223                         c = p.getNext();
 224                 }
 225                 if (!p.isDone()){
 226                         value.append(c);
 227                         c = p.getNext();
 228                         value.append(c);
 229                 }
 230                 return new Token(Token.COMMENT, value.toString(), offset, offset + value.length());
 231         }
 232 }