关于编写解析器:我借此机会用状态机编写了一个自己的解析器:
import sys
# Characters that trigger a state change in the scanner below.
ASTERISK = '*'
# Fallback key in every transition table: used for any character that has no
# explicit entry. It is longer than one character, so it can never collide
# with a character read from the input.
DEFAULT = 'default'
EOL = '\n'
ESCAPE = '\\'
QUOTE = '"'
SLASH = '/'


class ExtractStrings:
    """Collect the contents of double-quoted string literals from source text.

    The text is scanned one character at a time by a small state machine.
    Text inside ``//`` line comments and ``/* ... */`` block comments is
    skipped, so quotes appearing there do not start a string. Escape
    sequences inside a string are kept verbatim (backslash included); an
    escaped quote does not terminate the string.

    Iterating over an instance parses the buffer on first use and yields
    the collected strings (without their surrounding quotes) in order of
    appearance.
    """

    def __init__(self, multiline_string):
        """Store the text to scan; nothing is parsed until it is needed."""
        self.buffer = multiline_string
        # Characters of the string literal currently being read.
        self.chars_collected = ''
        # List of finished strings, or None while parse() has not run yet.
        self.strings = None

    def noop(self, ch):
        """Transition action: ignore the character."""
        pass

    def collect_char(self, ch):
        """Transition action: append *ch* to the string being read."""
        self.chars_collected += ch

    def return_string(self, ch):
        """Transition action: finish the current string and start afresh.

        *ch* is the closing quote and is not stored.
        """
        self.strings.append(self.chars_collected)
        self.chars_collected = ''

    def parse(self):
        """Scan ``self.buffer`` and fill ``self.strings``.

        Every call starts from a clean slate, so calling it repeatedly
        always produces the same result. A string literal that is still
        open when the input ends is not added to ``self.strings``.
        """
        self.strings = []
        # Drop leftovers of an unterminated string from an earlier run so
        # they cannot leak into the first string of this run.
        self.chars_collected = ''

        # state name -> {character: (action, next state name)}
        state = {
            'start': {
                QUOTE: (self.noop, 'in_string'),
                SLASH: (self.noop, 'first_slash'),
                DEFAULT: (self.noop, 'start'),
            },
            'in_string': {
                QUOTE: (self.return_string, 'start'),
                ESCAPE: (self.collect_char, 'escaping'),
                DEFAULT: (self.collect_char, 'in_string'),
            },
            'escaping': {
                # Whatever follows a backslash is part of the string,
                # even if it is a quote.
                DEFAULT: (self.collect_char, 'in_string'),
            },
            'first_slash': {
                SLASH: (self.noop, 'line_comment'),
                ASTERISK: (self.noop, 'block_comment'),
                # A lone slash directly followed by a quote (e.g. a/"b"):
                # the quote opens a string and must not be swallowed.
                QUOTE: (self.noop, 'in_string'),
                DEFAULT: (self.noop, 'start'),
            },
            'line_comment': {
                EOL: (self.noop, 'start'),
                DEFAULT: (self.noop, 'line_comment'),
            },
            'block_comment': {
                ASTERISK: (self.noop, 'near_comment_block_end'),
                DEFAULT: (self.noop, 'block_comment'),
            },
            'near_comment_block_end': {
                SLASH: (self.noop, 'start'),
                # Stay here on runs of asterisks such as "**/".
                ASTERISK: (self.noop, 'near_comment_block_end'),
                DEFAULT: (self.noop, 'block_comment'),
            },
        }

        current = 'start'
        for ch in self.buffer:
            transitions = state[current]
            action, next_state = transitions.get(ch, transitions[DEFAULT])
            action(ch)
            current = next_state

    def __iter__(self):
        """Return an iterator over the strings, parsing on first use."""
        if self.strings is None:
            self.parse()
        return iter(self.strings)
if __name__ == '__main__':
    # Command-line use: print every string literal found in the given file,
    # one per line, wrapped in double quotes.
    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit('usage: %s SOURCE_FILE' % sys.argv[0])
    with open(sys.argv[1]) as f:
        code = f.read()
    for string_literal in ExtractStrings(code):
        print('"%s"' % string_literal)
它是如何工作的?
状态机定义了各个状态、每个状态下要执行的动作(图中未显示)以及到下一个状态的转换。状态机一旦定义好(以嵌套字典的形式),解析过程就只是一个循环:读取下一个字符,在当前状态的转换表中查找该字符,执行对应的动作,然后转换到指定的下一个状态。
状态机是一个嵌套字典。对于外部字典,键是状态名称,值是内部字典。对于内部字典,键是下一个字符,值是 (action, next state) 的元组;特殊键 'default' 用于处理该状态下没有显式列出的任何字符。