如果需要比正则表达式或手写词法分析器更强大的方案,可以使用类似 Flex 的工具来生成词法分析器。我用 JFlex+CUP(http://jflex.de/manual.html、http://www2.cs.tum.edu/projects/cup/install.php)编写了一个简单的解析器,用于解析您提供的文本。
首先,您需要创建一个 .flex 文件,用来描述生成标记(token)的规则:
import java_cup.runtime.*;
import java.util.*;
%%
/* Scanner options. */
/* Work on the Unicode character set. */
%unicode
/* Name of the generated scanner class (and of the generated .java file). */
%class LexicalAnalyzer
/* Maintain the current line number in yyline (read by error()). */
%line
/* Maintain the current column number in yycolumn. */
%column
/* CUP compatibility: the scanner exposes next_token() and returns java_cup.runtime.Symbol. */
%cup
/* Numbers: integer part without leading zeros, optional fraction, optional exponent. */
number = ([1-9][0-9]*| 0)([.][0-9]+ )?([eE]([+]|[-])?[0-9]+)?
digit = [0-9]
underscore = [_]
/* Identifier: one or more name segments joined by dots. */
identifier = {identifier5} ( [.] {identifier5} )*
/* Segment that starts with a letter. */
identifier2 = {letter} ({letter}|{digit}|{underscore})*
/* Segment that starts with a digit; at least one more character is required.
   NOTE(review): digit-only text also matches the number macro; which token is
   produced then depends on match length and rule order, so confirm this overlap is intended. */
identifier3 = {digit} ({letter}|{digit}|{underscore})+
/* Segment that starts with an underscore; at least one more character is required. */
identifier4 = {underscore} ({letter}|{digit}|{underscore})+
/* Any of the three segment forms above. */
identifier5 = {identifier2} | {identifier3} | {identifier4}
letter = {lowercase} | {uppercase}
lowercase = [a-z]
uppercase = [A-Z]
/* Any character except carriage return and line feed. */
inputchar = [^\r\n]
/* Comments */
lineterminator = \r | \n | \r\n
/* End-of-line comment: two slashes followed by the rest of the line.
   The line terminator is optional so that a comment on the last line of the
   input, which has no trailing newline, is still recognised as a comment
   instead of being split into two division operators and ordinary tokens. */
simplecomment = "//" {inputchar}* {lineterminator}?
/* Block comment: slash-star, then any text in which a run of stars is never
   followed directly by a slash, then a closing run of stars and a slash. */
blockcomment = "/*" ( [^*]* | "*"+ [^/*] )* "*"+ "/"
%{
private void error(){
System.err.print("Sintax error on line " + (yyline+1));
System.err.println(". Unrecognizable token: \"" + yytext() + "\"");
//System.exit(1);
}
/**
 * Logs a recognised token to {@code System.out} and wraps it in a CUP symbol.
 *
 * @param type  token code, one of the {@code sym} constants
 * @param value semantic value of the token (the matched text, or the decoded
 *              contents for string literals)
 * @return a symbol carrying the token code, the scanner position and the value
 */
private Symbol processToken(int type, Object value) {
    System.out.println("Type: " + type);
    System.out.println("Value: " + value );
    // Pass the value and position along: without them a consumer of the
    // symbol stream cannot see the matched text or report where it occurred.
    // yyline/yycolumn refer to the text matched by the rule that is running,
    // so for string literals they point at the closing quote.
    return new Symbol(type, yyline, yycolumn, value);
}
// Collects the decoded contents of the string literal currently being scanned.
// StringBuilder is sufficient: the buffer is only touched from scanner actions,
// so StringBuffer's synchronization buys nothing.
final StringBuilder str = new StringBuilder();
%}
/* Lexical state that is active between the quotes of a string literal. */
%state STRING
/* Lexical state whose rules discard all input; no rule visible in this
   specification switches to it. */
%state END
%%
<YYINITIAL> {
/* string literals: an opening quote resets the buffer and enters STRING */
['] { str.setLength(0); yybegin(STRING); }
/* keyword; listed before the identifier rule so it wins when both match the same text */
"and" { return processToken( sym.AND , yytext()); }
/* number literals */
{number} { return processToken(sym.NUMBER , yytext()); }
/* identifiers */
{identifier} { return processToken( sym.IDENTIFIER , yytext()); }
/* punctuation */
"(" { return processToken( sym.OPENP , yytext()); }
")" { return processToken( sym.CLOSEP , yytext()); }
"=" { return processToken(sym.EQUALS , yytext()); }
"," { return processToken(sym.COMMA , yytext()); }
/* arithmetic operators all share the OP token type */
"+" { return processToken(sym.OP , yytext()); }
"-" { return processToken(sym.OP , yytext()); }
"*" { return processToken(sym.OP , yytext()); }
"/" { return processToken(sym.OP , yytext()); }
/* whitespace and comments are skipped */
{simplecomment} { /* Do nothing */ }
{blockcomment} { /* Do nothing */ }
" " | \t | {lineterminator} { /* Do nothing */ }
/* Fallback for every other character. The any-character class is used rather
   than the dot because the dot does not match line-terminator characters, so a
   stray form feed or similar would abort the scanner with a no-match error
   instead of being reported here. */
[^] { error(); }
}
/* string literals */
<STRING> {
/* closing quote: return to YYINITIAL and emit the collected contents */
['] { yybegin(YYINITIAL); return processToken( sym.STRING , str.toString()); }
/* escape sequences */
\\t { str.append('\t'); }
\\n { str.append('\n'); }
\\r { str.append('\r'); }
\\\" { str.append('\"'); }
\\\\ { str.append('\\'); }
\\['] { str.append('\''); }
/* backslash followed by three decimal digits: append the character with that code */
\\[0-9][0-9][0-9] { str.append((char) Integer.parseInt(yytext().substring(1))); }
/* run of ordinary characters */
[^\n\r\'\\\t]+ { str.append( yytext() ); }
/* Malformed string content: a raw line break, a raw tab or a stray backslash.
   The any-character class is required here because the dot does not match
   line terminators; with the dot, a line break inside a literal matched no
   rule at all and aborted the scanner. The offending character is reported
   and skipped. */
[^] { error(); }
}
/* END state: every character, line feed included, is consumed and ignored. */
<END> {
\n | . { /* discard */ }
}
然后,您需要使用 JFlex.jar 编译这个词法分析器规范(.flex 文件):
java -jar JFlex.jar lexical.flex
它将创建一个名为“LexicalAnalyzer.java”的源文件,您可以使用它根据您的规范将字符串分解为标记。
/**
 * Demo driver that runs the generated {@code LexicalAnalyzer} over a sample
 * expression and pulls tokens until end of input.
 *
 * <p>The driver prints nothing itself; the token listing comes from the
 * scanner's {@code processToken} helper, which logs every token it returns.
 */
public class Parser {
    /**
     * Tokenizes the built-in sample expression.
     *
     * @param args unused
     * @throws Exception if the scanner fails while reading its input
     */
    public static void main(String[] args) throws Exception {
        String input = "look_up_check('US POPULATION', ( 'POPULATION' = 3844829 ) and ('CITY' = 'Los Angeles'))";
        // Hand the scanner characters rather than bytes: this avoids a round trip
        // through the platform default charset, and the Reader constructor is the
        // one every JFlex-generated scanner provides (recent JFlex releases do not
        // generate an InputStream constructor).
        LexicalAnalyzer lexer = new LexicalAnalyzer(new java.io.StringReader(input));
        Symbol token;
        do {
            token = lexer.next_token();
        } while (token.sym != sym.EOF);
    }
}
产生输出:
Type: 5
Value: look_up_check
Type: 3
Value: (
Type: 1
Value: US POPULATION
Type: 9
Value: ,
Type: 3
Value: (
Type: 1
Value: POPULATION
Type: 8
Value: =
Type: 7
Value: 3844829
Type: 4
Value: )
Type: 2
Value: and
Type: 3
Value: (
Type: 1
Value: CITY
Type: 8
Value: =
Type: 1
Value: Los Angeles
Type: 4
Value: )
Type: 4
Value: )
编辑: sym.java 类
/**
 * Token type codes shared by the scanner and the code that consumes its symbols.
 *
 * <p>The lowercase class name is kept because a scanner generated with
 * {@code %cup} refers to {@code sym.EOF}. The codes are declared {@code final}
 * so they cannot be reassigned at runtime and can be used as {@code case}
 * labels in a {@code switch}.
 */
public class sym {
    /** Quoted string literal; the symbol value is the decoded contents. */
    public static final int STRING = 1;
    /** The {@code and} keyword. */
    public static final int AND = 2;
    /** Opening parenthesis. */
    public static final int OPENP = 3;
    /** Closing parenthesis. */
    public static final int CLOSEP = 4;
    /** Identifier, possibly dotted. */
    public static final int IDENTIFIER = 5;
    /** Arithmetic operator: {@code +}, {@code -}, {@code *} or {@code /}. */
    public static final int OP = 6;
    /** Numeric literal. */
    public static final int NUMBER = 7;
    /** Equals sign. */
    public static final int EQUALS = 8;
    /** Comma. */
    public static final int COMMA = 9;
    /** End of input. */
    public static final int EOF = 10;
}