我知道已经有一个被采纳的答案,但我使用的方法实际上可以帮助您找出文件中的格式错误,而不仅仅是忽略多余的内容:
from tokenize import TokenInfo, tokenize, ENCODING, ENDMARKER, NEWLINE, NAME
from typing import Callable, Generator
class TripParseException(Exception):
    """Raised when a token in the trip file is not what the parser expected."""
def assert_token_string(token: TokenInfo, expected_string: str) -> None:
    """Check that a token's text is exactly ``expected_string``.

    Args:
        token: The token to inspect.
        expected_string: The text the token must carry.

    Raises:
        TripParseException: If ``token.string`` differs from
            ``expected_string``. The message names the expected and actual
            text, the line number and the offending source line.
    """
    if token.string == expected_string:
        return
    raise TripParseException(
        "Unable to parse trip file: expected {}, found {} in line {} ({})".format(
            expected_string,
            token.string,
            token.start[0],
            # token.line keeps its line terminator; strip it so the closing
            # parenthesis of the message stays on the same line.
            token.line.rstrip("\r\n"),
        )
    )
def assert_token_type(token: TokenInfo, expected_type: int) -> None:
    """Check that a token has the given tokenize type.

    Args:
        token: The token to inspect.
        expected_type: A token type constant such as ``NAME`` or ``NEWLINE``.

    Raises:
        TripParseException: If ``token.type`` differs from ``expected_type``.
            The message reports both types by symbolic name (falling back to
            the raw number for unknown values), the line number and the
            offending source line.
    """
    if token.type == expected_type:
        return
    # Local import: only needed on the error path, and keeps this function
    # independent of the file-level import list.
    from token import tok_name

    raise TripParseException(
        "Unable to parse trip file: expected type {}, found type {} in line {} ({})".format(
            # Symbolic names ("NEWLINE") are far easier to act on than the
            # bare integers the token constants evaluate to.
            tok_name.get(expected_type, expected_type),
            tok_name.get(token.type, token.type),
            token.start[0],
            # Strip the line terminator so the message stays on one line.
            token.line.rstrip("\r\n"),
        )
    )
def parse_destinations(token_stream: Generator[TokenInfo, None, None]) -> dict:
    """Parse one ``DESTINATIONS BEGIN`` ... ``DESTINATIONS END`` section.

    Every line inside the section must consist of NAME tokens only: the
    first is the destination, the remaining ones (possibly none) are its
    plane codes. Blank lines are ignored.

    Note that a line starting with the word ``DESTINATIONS`` always ends the
    section, so a destination cannot itself be called ``DESTINATIONS``.

    Args:
        token_stream: Token iterator positioned at the ``DESTINATIONS``
            token that opens the section.

    Returns:
        A dict mapping each destination name to the list of its plane codes.
        A destination listed twice keeps only its last line.

    Raises:
        TripParseException: If the section header, a destination line or the
            section footer does not have the expected shape.
    """
    from tokenize import NL

    def _next_line_start() -> TokenInfo:
        # Blank lines are tokenized as NL (not NEWLINE); skip them so they
        # are not mistaken for a malformed line.
        tok = next(token_stream)
        while tok.type == NL:
            tok = next(token_stream)
        return tok

    assert_token_string(_next_line_start(), "DESTINATIONS")
    assert_token_string(next(token_stream), "BEGIN")
    assert_token_type(next(token_stream), NEWLINE)

    destinations = {}
    current_token = _next_line_start()
    while current_token.string != "DESTINATIONS":
        assert_token_type(current_token, NAME)
        destination = current_token.string

        # Collect every remaining NAME on this logical line as a plane code.
        plane_codes = []
        current_token = next(token_stream)
        while current_token.type != NEWLINE:
            assert_token_type(current_token, NAME)
            plane_codes.append(current_token.string)
            current_token = next(token_stream)
        destinations[destination] = plane_codes

        # The current token is the NEWLINE; move to the next line's first token.
        current_token = _next_line_start()

    # "DESTINATIONS" was just consumed; it must be followed by "END".
    assert_token_string(next(token_stream), "END")
    assert_token_type(next(token_stream), NEWLINE)
    return destinations
def parse_trip(token_stream: Generator[TokenInfo, None, None]) -> tuple:
    """Parse one ``SOURCE: <origin>`` record and its destinations section.

    Blank lines before the record (or before the end of the input) are
    skipped.

    Args:
        token_stream: Token iterator positioned at the start of a record or
            at the end of the input.

    Returns:
        ``(origin, destinations)`` where ``origin`` is the name after
        ``SOURCE:`` and ``destinations`` is the result of
        ``parse_destinations``; or ``(None, None)`` when the ENDMARKER token
        is reached instead of a record.

    Raises:
        TripParseException: If the record does not have the expected shape.
    """
    from tokenize import NL

    current_token = next(token_stream)
    # Blank lines are tokenized as NL; without this skip a trailing empty
    # line would be reported as a malformed record instead of end of input.
    while current_token.type == NL:
        current_token = next(token_stream)

    if current_token.type == ENDMARKER:
        return None, None

    assert_token_string(current_token, "SOURCE")
    assert_token_string(next(token_stream), ":")
    tok_origin = next(token_stream)
    assert_token_type(tok_origin, NAME)
    assert_token_type(next(token_stream), NEWLINE)
    return tok_origin.string, parse_destinations(token_stream)
def parse_trips(readline: Callable[[], bytes]) -> dict:
    """Parse a whole trip file into a dict keyed by origin.

    Args:
        readline: A callable returning the next line of the file as bytes,
            e.g. the ``readline`` method of a file opened in binary mode.

    Returns:
        A dict mapping each origin to the destinations parsed for it. When
        an origin occurs more than once, the last occurrence wins.

    Raises:
        TripParseException: If the token stream does not match the expected
            trip-file layout.
    """
    tokens = tokenize(readline)
    # The first token is expected to be the ENCODING marker.
    assert_token_type(next(tokens), ENCODING)

    parsed = {}
    while True:
        origin, destinations = parse_trip(tokens)
        # parse_trip signals the end of the input with None values.
        if origin is None or destinations is None:
            return parsed
        parsed[origin] = destinations
这样,您的调用代码将如下所示:
import pprint
# Binary mode is required: parse_trips expects a readline that returns bytes.
with open("trips.dat", "rb") as trips_file:
    trips = parse_trips(trips_file.readline)
pprint.pprint(trips)
产生预期结果:
{'RCM': {'JCK': ['SF3']}, 'TRO': {'GFN': ['SF3'], 'SYD': ['SF3', 'DH4']}}
如果您以后想在文件中加入其他信息,这种方式也会更加灵活。