这是原始方法 - 我获取全文并使用split() 分隔部分。
首先我使用split('Container:container_') 来拆分容器。
接下来在每个容器中,我使用split('LogType:') 分隔stderr、stdout 等。
在每个stderr 中,我使用split('Log') 分隔LogType、Log Upload Time、LogLength、Log Contents。
最后在每个Log Contents 中,我使用split('20/') 将每条消息拆分为单独的字符串。这部分以后应改用regex,因为明年的日期会以21/ 而不是20/ 开头。
我还按前三个空格拆分每条消息,以分别得到日期、时间、消息类型和消息文本。见最后的结果。
顺便说一句:
我只使用json 来显示它。它提供了比pprint (PrettyPrinter) 更好的结果。
我正在考虑同时保留stdout 和其他LogType。
import json

# Sample container log that the script below parses.
text = '''Container:container_12345
=============================
LogType:container
Log Upload Time :Thu Jun 25 12:24:45 +0100 2020
LogLength:0
Log Contents:
LogType:stderr
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:3000
Log Contents:
20/06/25 12:19:39 ERROR Exception found:
java.io.Exception:Not initated
at.apache.java.org........
20/06/25 12:19:40 INFO executor.EXECUTOR
20/06/25 12:20:41 WARN Warning as the node is accessed without started
LogType:stdout
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:0
Log Contents:
Container:container_e182_1234
=============================
LogType:container-localizer-syslog
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:0
Log Contents:
LogType:stderr
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:3000
Log Contents:
LogType:stdout
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:0
Log Contents:'''
# To parse a real file instead: text = open(...).read()

# NOTE: plain str.split() matches a marker anywhere in the text, not only at
# the start of a line, so a marker word inside a message also splits it.
all_containers = text.split('Container:container_')

# One dictionary per container, in file order.
results = []

# [1:] skips the text before the first `Container:` marker.
for one_container in all_containers[1:]:
    one_result = {}
    # The container id is the remainder of the marker line.
    one_result['id'] = one_container.split('\n', 1)[0]

    all_types = one_container.split('LogType:')
    # [1:] skips the text before the first `LogType:` marker.
    for one_type in all_types[1:]:
        # Only the stderr section is parsed.
        if not one_type.startswith('stderr'):
            continue

        # [1:] drops the `stderr` name that precedes the first `Log` field.
        for one_log in one_type.split('Log')[1:]:
            key, sep, val = one_log.partition(':')
            if not sep:
                # A fragment without a colon is not a `key: value` field.
                continue
            key = key.strip()
            val = val.strip()

            if key == 'Contents':
                # split() removes the `20/` separator, so put it back.
                messages = ['20/' + x for x in val.split('20/')[1:]]
                one_result[key] = []
                for msg in messages:
                    # date, time, type, text
                    parts = msg.split(' ', 3)
                    if len(parts) < 3:
                        # Not even date/time/type present; nothing to classify.
                        continue
                    msg_type = parts[2].strip()
                    if msg_type in ("ERROR", "WARN"):
                        one_result[key].append({
                            'date': parts[0],
                            'time': parts[1],
                            'type': msg_type,
                            # The text is optional (e.g. a bare `... WARN` line).
                            'text': parts[3].strip() if len(parts) > 3 else '',
                        })
            elif key == 'Length':
                one_result[key] = int(val)
            else:
                one_result[key] = val

    results.append(one_result)

# json is used only for display; pprint.pprint(results) is an alternative.
print(json.dumps(results, indent=2))
结果:
[
{
"id": "12345",
"Upload Time": "Thu Jun 25 12:24:52 +0100 2020",
"Length": 3000,
"Contents": [
{
"date": "20/06/25",
"time": "12:19:39",
"type": "ERROR",
"text": "Exception found:\njava.io.Exception:Not initated\n at.apache.java.org........"
},
{
"date": "20/06/25",
"time": "12:20:41",
"type": "WARN",
"text": "Warning as the node is accessed without started"
}
]
},
{
"id": "e182_1234",
"Upload Time": "Thu Jun 25 12:24:52 +0100 2020",
"Length": 3000,
"Contents": []
}
]
编辑:
这是使用re.split() 和^ 的版本,^ 使其仅匹配位于行首的项。
我也用
re.split('^(?=\d+/)', val, flags=re.MULTILINE)
在任意日期处拆分。我使用(?=...) (lookahead,前瞻断言) 让日期保留在拆分出的值中。
import json
import re

# Sample container log that the script below parses.
text = '''Container:container_12345
=============================
LogType:container
Log Upload Time :Thu Jun 25 12:24:45 +0100 2020
LogLength:0
Log Contents:
LogType:stderr
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:3000
Log Contents:
20/06/25 12:19:39 ERROR Exception found: Log
java.io.Exception:Not initated
at.apache.java.org........
20/06/25 12:19:40 INFO executor.EXECUTOR
20/06/25 12:20:41 WARN Warning as the node is accessed without started
LogType:stdout
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:0
Log Contents:
Container:container_e182_1234
=============================
LogType:container-localizer-syslog
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:0
Log Contents:
LogType:stderr
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:3000
Log Contents:
LogType:stdout
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:0
Log Contents:'''
# To parse a real file instead: text = open(...).read()

# One dictionary per container, in file order.
results = []

# `^` with re.MULTILINE matches the markers only at the start of a line.
all_containers = re.split('^Container:container_', text, flags=re.MULTILINE)

# [1:] skips the text before the first `Container:` marker.
for one_container in all_containers[1:]:
    one_result = {}
    # The container id is the remainder of the marker line.
    one_result['id'] = one_container.split('\n', 1)[0]

    all_types = re.split('^LogType:', one_container, flags=re.MULTILINE)
    # [1:] skips the text before the first `LogType:` marker.
    for one_type in all_types[1:]:
        # Only the stderr section is parsed.
        if not one_type.startswith('stderr'):
            continue

        all_logs = re.split('^Log', one_type, flags=re.MULTILINE)
        # [1:] drops the `stderr` name that precedes the first `Log` field.
        for one_log in all_logs[1:]:
            key, sep, val = one_log.partition(':')
            if not sep:
                # A line starting with `Log` but without a colon is not a field.
                continue
            key = key.strip()
            val = val.strip()

            if key == 'Contents':
                one_result['Contents'] = []
                # Split in front of every line that starts with `<digits>/`;
                # the lookahead `(?=...)` keeps the date inside the message.
                messages = re.split(r'^(?=\d+/)', val, flags=re.MULTILINE)
                # The first element is whatever precedes the first date.
                messages = messages[1:]

                for msg in messages:
                    # date, time, type, text
                    parts = msg.split(' ', 3)
                    if len(parts) < 3:
                        # Not even date/time/type present; nothing to classify.
                        continue
                    msg_type = parts[2].strip()
                    if msg_type in ("ERROR", "WARN"):
                        one_result['Contents'].append({
                            'date': parts[0],
                            'time': parts[1],
                            'type': msg_type,
                            # The text is optional (e.g. a bare `... WARN` line).
                            'text': parts[3].strip() if len(parts) > 3 else '',
                        })
            elif key == 'Length':
                one_result[key] = int(val)
            else:
                one_result[key] = val

    results.append(one_result)

# json is used only for display; pprint.pprint(results) is an alternative.
print(json.dumps(results, indent=2))
编辑: 代码拆分为函数以使其更具可读性。我还添加了line_start、line_end。
它可以解析所有数据并为您提供 python 数据,您可以稍后使用这些数据来搜索信息。或者您可以使用参数filter_log_types 和filter_content_types 仅获取选定的信息。
# Sample log with two containers, each holding three `LogType` sections;
# only the first container's stderr section has log messages.
# It is the input passed to `parse_file` in the `__main__` section.
text = '''Container:container_12345
=============================
LogType:container
Log Upload Time :Thu Jun 25 12:24:45 +0100 2020
LogLength:0
Log Contents:
LogType:stderr
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:3000
Log Contents:
20/06/25 12:19:39 ERROR Exception found: Log
java.io.Exception:Not initated
at.apache.java.org........
20/06/25 12:19:40 INFO executor.EXECUTOR
20/06/25 12:20:41 WARN Warning as the node is accessed without started
LogType:stdout
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:0
Log Contents:
Container:container_e182_1234
=============================
LogType:container-localizer-syslog
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:0
Log Contents:
LogType:stderr
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:3000
Log Contents:
LogType:stdout
Log Upload Time :Thu Jun 25 12:24:52 +0100 2020
LogLength:0
Log Contents:
'''
import re
def parse_file(text, filter_log_types=None, filter_content_types=None):
    """Split the log text into containers and parse each one.

    text - the complete log text
    filter_log_types - list or tuple, forwarded to `parse_container`
    filter_content_types - list or tuple, forwarded to `parse_container`

    Returns a list with one `parse_container` result per
    `Container:container_` marker found at the start of a line.
    """
    # Handed down so `parse_log_contents` can look up line numbers.
    lines_of_file = text.split('\n')

    chunks = re.split('^Container:container_', text, flags=re.MULTILINE)
    # The first chunk is the text before the first `Container:` marker.
    container_chunks = chunks[1:]

    return [
        parse_container(chunk, lines_of_file, filter_log_types, filter_content_types)
        for chunk in container_chunks
    ]
def parse_container(text, full_text_lines, filter_log_types=None, filter_content_types=None):
    """Parse one container chunk into a dictionary.

    text - container text that starts right after `Container:container_`,
        so its first line is the container id
    full_text_lines - lines of the whole file, forwarded to `parse_log`
    filter_log_types - list or tuple of log type names to keep;
        when empty or None every log type is kept
    filter_content_types - list or tuple, forwarded to `parse_log`

    Returns a dictionary with the key 'id' plus one key per kept log type,
    each mapping to the dictionary returned by `parse_log`. If a log type
    occurs more than once in the container, the last occurrence wins.
    """
    results = {}

    # partition() also copes with a chunk that is only the id line
    # (no trailing newline), where a two-way split would fail to unpack.
    first, _, rest = text.partition('\n')
    # strip() removes a trailing `\r` left by CRLF line endings.
    results['id'] = first.strip()

    # The lookahead keeps `LogType:` at the start of every section.
    all_log_types = re.split('^(?=LogType:)', rest, flags=re.MULTILINE)

    # [1:] skips the text before the first `LogType:` marker.
    for item in all_log_types[1:]:
        data = parse_log(item, full_text_lines, filter_content_types)
        log_type = data['type']
        if not filter_log_types or (log_type in filter_log_types):
            results[log_type] = data

    return results
def parse_log(text, full_text_lines, filter_content_types=None):
    """Parse one `LogType` section into a dictionary.

    text - section text starting with `LogType:<name>`
    full_text_lines - lines of the whole file, forwarded to
        `parse_log_contents`
    filter_content_types - list or tuple, forwarded to `parse_log_contents`

    Returns a dictionary whose keys are the lower-cased field names with
    the `Log` prefix removed ('type', 'upload time', 'length', 'contents').
    'length' is converted to int, and 'contents' holds the list returned
    by `parse_log_contents`.
    """
    results = {}

    # Cut the section at the first `Log Contents:` line, so that message
    # lines which happen to start with `Log` are never taken for fields.
    sections = re.split(
        r'^Log[ \t]*(?i:contents)[ \t]*:', text, maxsplit=1, flags=re.MULTILINE
    )
    header = sections[0]

    # [1:] skips the text before the first `Log...` field line.
    for item in re.split('^Log', header, flags=re.MULTILINE)[1:]:
        key, sep, val = item.partition(':')
        if not sep:
            # A line starting with `Log` but without a colon is not a field.
            continue
        key = key.strip().lower()
        val = val.strip()
        if key == 'length':
            results[key] = int(val)
        else:
            results[key] = val

    # sections[1] exists only when a `Log Contents:` line was found.
    if len(sections) > 1:
        results['contents'] = parse_log_contents(
            sections[1].strip(), full_text_lines, filter_content_types
        )

    return results
def parse_log_contents(text, full_text_lines, filter_content_types=None):
    """Split the contents of one log into separate messages.

    text - the text that follows `Log Contents:`
    full_text_lines - lines of the whole file, used to look up where
        every message starts
    filter_content_types - list or tuple of message types to keep
        (e.g. ['ERROR', 'WARN']); when empty or None every type is kept

    Returns a list of dictionaries with the keys 'date', 'time', 'type',
    'text', 'line_start' and 'line_end'. Line numbers are 0-based indexes
    into `full_text_lines`; both are None when the first line of the
    message cannot be found there.

    NOTE: the line lookup returns the first line in the file that is
    identical to the message's first line, so repeated identical lines
    all report the position of the first one.
    """
    results = []

    # A message starts at every line that begins with `<digits>/`;
    # the lookahead `(?=...)` keeps the date inside the message.
    messages = re.split(r'^(?=\d+/)', text, flags=re.MULTILINE)
    # The first element is whatever precedes the first date.
    messages = messages[1:]

    for msg in messages:
        # Every message but the last ends with the newline that precedes
        # the next date; drop it so it is not counted as an extra line.
        msg_lines = msg.rstrip('\n').split('\n')
        first_line = msg_lines[0]

        # date, time, type and (optional) text of the first line
        parts = first_line.split(' ', 3)
        if len(parts) < 3:
            print('WARNING: Not enough parts in message (needs 3 but get {}): {}'.format(len(parts), msg))
            continue

        msg_date = parts[0]
        msg_time = parts[1]
        msg_type = parts[2].strip()
        # A message may consist of date, time and type only.
        first_text = parts[3] if len(parts) > 3 else ''
        msg_text = '\n'.join([first_text] + msg_lines[1:]).strip()

        try:
            msg_line_start = full_text_lines.index(first_line)
        except ValueError:
            # The caller may pass stripped text, so the last line can lack
            # trailing whitespace that is present in the file.
            wanted = first_line.rstrip()
            msg_line_start = next(
                (number for number, line in enumerate(full_text_lines)
                 if line.rstrip() == wanted),
                None,
            )

        if msg_line_start is None:
            msg_line_end = None
        else:
            msg_line_end = msg_line_start + len(msg_lines) - 1

        if not filter_content_types or (msg_type in filter_content_types):
            results.append({
                'date': msg_date,
                'time': msg_time,
                'type': msg_type,
                'text': msg_text,
                'line_start': msg_line_start,
                'line_end': msg_line_end,
            })

    return results
# --- main ----

if __name__ == '__main__':
    import json

    # To parse a real file instead: text = open(...).read()
    # Without the two filters every log type and message type is returned:
    #     results = parse_file(text)
    wanted_log_types = ['stderr']
    wanted_message_types = ['ERROR', 'WARN']
    results = parse_file(text, wanted_log_types, wanted_message_types)

    # json is used only for display; pprint.pprint(results) is an alternative.
    print(json.dumps(results, indent=2))