计算机相关专业的差不多都有学过编译原理吧?今天我班门弄刀,也谈谈我自己对编译原理的认识和理解。当然啦,我主要要谈的是编译器的前端的实现,后端的代码生成我目前还没有研究过。

      实现一个编译器有两大步骤:一是词法分析,二是语法分析。应对这两块有很多的工具是可以帮助我们进行这样的工作的(例如:flex、yacc等),但我要说的是怎么完全手工去实现它。语法分析的主要目的是把一个个的字符和字母之类的东西给识别为token,而语法分析的主要作用是去构建分析后的对象或是构建抽象语法树。下面我结合早一段时间开发的JSON的反序列化来聊聊整个的过程。

 

[] chars)
        {
            IList<Token> tokens = new List<Token>();
            StringBuilder sb 
= new StringBuilder();
            CharReader src 
= new CharReader(chars);
            
char c = src.Read();

            readLoop:
            
while (c != '\0')
            {
                
switch (c)
                {
                    
#region 空白字符
                    
case '\t':
                        {
                            c 
= src.Read();
                            
while (c == '\t') { c = src.Read(); }
                            
break;
                        }
                    
case ' ':
                        {
                            c 
= src.Read();
                            
while (c == ' ') { c = src.Read(); }
                            
break;
                        }
                    
case '\r':
                        {
                            c 
= src.Read();
                            
while (c == '\r') { c = src.Read(); }
                            
break;
                        }
                    
case '\n':
                        {
                            c 
= src.Read();
                            
while (c == '\n') { c = src.Read(); }
                            
break;
                        }
                    
#endregion

                    
#region 运算符
                    
case '[':
                        {
                            tokens.Add(
new Token(TokenId.LBracket));
                            c 
= src.Read();
                            
break;
                        }
                    
case ']':
                        {
                            tokens.Add(
new Token(TokenId.RBracket));
                            c 
= src.Read();
                            
break;
                        }
                    
case '{':
                        {
                            tokens.Add(
new Token(TokenId.LCurly));
                            c 
= src.Read();
                            
break;
                        }
                    
case '}':
                        {
                            tokens.Add(
new Token(TokenId.RCurly));
                            c 
= src.Read();
                            
break;
                        }
                    
case ':':
                        {
                            tokens.Add(
new Token(TokenId.Colon));
                            c 
= src.Read();
                            
break;
                        }
                    
case ',':
                        {
                            tokens.Add(
new Token(TokenId.Comma));
                            c 
= src.Read();
                            
break;
                        }
                    
#endregion

                    
#region    字符串
                    
case '@':
                    
case '\'':
                    case '"':
                        {
                            
bool isVerbatim = false;
                            
if (c == '@')
                            {
                                isVerbatim 
= true;
                                c 
= src.Read(); // skip to follow quote
                            }
                            sb.Length 
= 0;
                            
char quote = c;
                            
bool isSingleQuote = (c == '\'');
                            c = src.Read();
                            
while (!src.Eof)
                            {
                                
if (c == '\\' && !isVerbatim) // normal escaped chars
                                {
                                    c 
= src.Read();
                                    
switch (c)
                                    {
                                        
case '\0':
                                            {
                                                
if (src.Eof)
                                                {
                                                    
goto readLoop;
                                                }
                                                
else
                                                {
                                                    sb.Append(
'\0');
                                                    c 
= src.Read();
                                                    
break;
                                                }
                                            }
                                        
case 'a':
                                            {
                                                sb.Append(
'\a');
                                                c 
= src.Read();
                                                
break;
                                            }
                                        
case 'b':
                                            {
                                                sb.Append(
'\b');
                                                c 
= src.Read();
                                                
break;
                                            }
                                        
case 'f':
                                            {
                                                sb.Append(
'\f');
                                                c 
= src.Read();
                                                
break;
                                            }
                                        
case 'n':
                                            {
                                                sb.Append(
'\n');
                                                c 
= src.Read();
                                                
break;
                                            }
                                        
case 'r':
                                            {
                                                sb.Append(
'\r');
                                                c 
= src.Read();
                                                
break;
                                            }
                                        
case 't':
                                            {
                                                sb.Append(
'\t');
                                                c 
= src.Read();
                                                
break;
                                            }
                                        
case 'v':
                                            {
                                                sb.Append(
'\v');
                                                c 
= src.Read();
                                                
break;
                                            }
                                        
case '\\':
                                            {
                                                sb.Append(
'\\');
                                                c 
= src.Read();
                                                
break;
                                            }
                                        
case '\'':
                                            {
                                                sb.Append(
'\'');
                                                c = src.Read();
                                                
break;
                                            }
                                        
case '\"':
                                            {
                                                
// strings are always stored as verbatim for now, so the double quote is needed
                                                sb.Append("\"\"");
                                                c 
= src.Read();
                                                
break;
                                            }
                                        
default:
                                            {
                                                sb.Append(c);
                                                
break;
                                            }
                                    }
                                }
                                
else if (c == '\"')
                                {
                                    c 
= src.Read();
                                    
// two double quotes are escapes for quotes in verbatim mode
                                    if (c == '\"' && isVerbatim)// verbatim escape
                                    {
                                        sb.Append(
"\"\"");
                                        c 
= src.Read();
                                    }
                                    
else if (isSingleQuote)
                                    {
                                        sb.Append(
'\"');
                                    }
                                    
else
                                    {
                                        
break;
                                    }
                                }
                                
else // non escaped
                                {
                                    
if (c == quote)
                                    {
                                        
break;
                                    }
                                    sb.Append(c);
                                    c 
= src.Read();
                                }

                            }

                            
if (c != '\0')
                            {
                                
if (c == quote)
                                {
                                    c 
= src.Read(); // skip last quote
                                }

                                tokens.Add(
new Token(TokenId.String, sb.ToString()));
                            }
                            
break;
                        }
                    
#endregion

                    
#region 数字
                    
case '0':
                    
case '1':
                    
case '2':
                    
case '3':
                    
case '4':
                    
case '5':
                    
case '6':
                    
case '7':
                    
case '8':
                    
case '9':
                    
case '.':
                    
case '+':
                    
case '-':
                        {
                            sb.Length 
= 0;
                            TokenId numKind 
= TokenId.Int;

                            
bool isDouble = false;

                            
if (c == '.')
                            {
                                c 
= src.Read();
                                
if (c < '0' || c > '9')
                                {
                                    
break;
                                }
                                
else
                                {
                                    sb.Append(
'.');
                                    numKind 
= TokenId.Double;
                                    isDouble 
= true;
                                }
                            }
                            
else if (c == '+')
                            {
                                c 
= src.Read();
                            }
                            
else if (c == '-')
                            {
                                sb.Append(
'-');
                                c 
= src.Read();
                            }

                            
bool isNum = true;

                            
while (isNum)
                            {
                                
switch (c)
                                {
                                    
case '0':
                                    
case '1':
                                    
case '2':
                                    
case '3':
                                    
case '4':
                                    
case '5':
                                    
case '6':
                                    
case '7':
                                    
case '8':
                                    
case '9':
                                        {
                                            sb.Append(c);
                                            
break;
                                        }
                                    
case '.':
                                        {
                                            
if (isDouble)
                                            {
                                                numKind 
= TokenId.Double;
                                                tokens.Add(
new Token(numKind, sb.ToString()));
                                                
goto readLoop;
                                            }

                                            c 
= src.Read();
                                            
if (c < '0' || c > '9')
                                            {
                                                tokens.Add(
new Token(numKind, sb.ToString()));
                                                
goto readLoop;
                                            }
                                            
else
                                            {
                                                sb.Append(
'.');
                                                sb.Append(c);
                                                numKind 
= TokenId.Double;
                                                isDouble 
= true;
                                            }
                                            
break;
                                        }
                                    
default:
                                        {
                                            isNum 
= false;
                                            
break;
                                        }
                                }

                                
if (isNum) c = src.Read();
                            }

                            tokens.Add(
new Token(numKind, sb.ToString()));
                            isNum 
= false;
                            
break;
                        }
                    
#endregion

                    
#region 关键字和变量
                    
default:
                        {
                            
switch (c)
                            {
                                
case 'a':
                                
case 'b':
                                
case 'c':
                                
case 'd':
                                
case 'e':
                                
case 'f':
                                
case 'g':
                                
case 'h':
                                
case 'i':
                                
case 'j':
                                
case 'k':
                                
case 'l':
                                
case 'm':
                                
case 'n':
                                
case 'o':
                                
case 'p':
                                
case 'q':
                                
case 'r':
                                
case 's':
                                
case 't':
                                
case 'u':
                                
case 'v':
                                
case 'w':
                                
case 'x':
                                
case 'y':
                                
case 'z':
                                
case 'A':
                                
case 'B':
                                
case 'C':
                                
case 'D':
                                
case 'E':
                                
case 'F':
                                
case 'G':
                                
case 'H':
                                
case 'I':
                                
case 'J':
                                
case 'K':
                                
case 'L':
                                
case 'M':
                                
case 'N':
                                
case 'O':
                                
case 'P':
                                
case 'Q':
                                
case 'R':
                                
case 'S':
                                
case 'T':
                                
case 'U':
                                
case 'V':
                                
case 'W':
                                
case 'X':
                                
case 'Y':
                                
case 'Z':
                                
case '_':
                                    {
                                        sb.Length 
= 0;
                                        sb.Append(c);
                                        c 
= src.Read();
                                        
bool endIdent = false;
                                        
bool possibleKeyword = true;

                                        
while (c != '\0' && !endIdent)
                                        {
                                            
switch (c)
                                            {
                                                
case 'a':
                                                
case 'b':
                                                
case 'c':
                                                
case 'd':
                                                
case 'e':
                                                
case 'f':
                                                
case 'g':
                                                
case 'h':
                                                
case 'i':
                                                
case 'j':
                                                
case 'k':
                                                
case 'l':
                                                
case 'm':
                                                
case 'n':
                                                
case 'o':
                                                
case 'p':
                                                
case 'q':
                                                
case 'r':
                                                
case 's':
                                                
case 't':
                                                
case 'u':
                                                
case 'v':
                                                
case 'w':
                                                
case 'x':
                                                
case 'y':
                                                
case 'z':
                                                    {
                                                        sb.Append(c);
                                                        c 
= src.Read();
                                                        
break;
                                                    }
                                                
case 'A':
                                                
case 'B':
                                                
case 'C':
                                                
case 'D':
                                                
case 'E':
                                                
case 'F':
                                                
case 'G':
                                                
case 'H':
                                                
case 'I':
                                                
case 'J':
                                                
case 'K':
                                                
case 'L':
                                                
case 'M':
                                                
case 'N':
                                                
case 'O':
                                                
case 'P':
                                                
case 'Q':
                                                
case 'R':
                                                
case 'S':
                                                
case 'T':
                                                
case 'U':
                                                
case 'V':
                                                
case 'W':
                                                
case 'X':
                                                
case 'Y':
                                                
case 'Z':
                                                
case '_':
                                                
case '0':
                                                
case '1':
                                                
case '2':
                                                
case '3':
                                                
case '4':
                                                
case '5':
                                                
case '6':
                                                
case '7':
                                                
case '8':
                                                
case '9':
                                                    {
                                                        possibleKeyword 
= false;
                                                        sb.Append(c);
                                                        c 
= src.Read();
                                                        
break;
                                                    }
                                                
default:
                                                    {
                                                        endIdent 
= true;
                                                        
break;
                                                    }
                                            }
                                        }
                                        
string identText = sb.ToString();
                                        sb.Length 
= 0;
                                        
bool isKeyword = possibleKeyword ? _keyWords.ContainsKey(identText) : false;
                                        
if (isKeyword)
                                        {
                                            tokens.Add(
new Token(_keyWords[identText]));
                                        }
                                        
else
                                        {
                                            tokens.Add(
new Token(TokenId.Ident, identText));
                                        }
                                        
break;
                                    }
                                
default:
                                    {
                                        tokens.Add(
new Token(TokenId.Invalid));
                                        c 
= src.Read();
                                        
break;
                                    }
                            }
                            
break;
                        }
                    
#endregion
                }
            }

            
return tokens;
        }

 

      关于JSON的定义请参看http://www.json.org/,要实现JSON的分析,我们要确定其基本的定义,而它的定义相对比较简单,它只有 Object、Array、String、Number和Value五种形式。而根据这几种对象的表示的边界条件,我们知道我们要分析的字符主要有:'{', '}', '[', ']', ':', ',', '"'等,对这些的分析的实现代码如上。

  完整代码如下:/Files/afxcn/MiniJSON.rar

下一篇:

  谈谈编译原理和其在WEB开发中的应用2

相关文章:

  • 2022-12-23
  • 2021-12-04
  • 2022-12-23
  • 2021-06-27
  • 2022-01-02
  • 2022-01-01
猜你喜欢
  • 2022-12-23
  • 2022-12-23
  • 2021-08-19
  • 2021-12-17
  • 2021-11-27
相关资源
相似解决方案