python解析FreeMind和XMind思维导图
记录瞬间
在实际工作中,通常需要使用思维导图进行一些分析和设计,但是,在设计好之后,想要把思维导图的内容转化成文字进行输出怎么做呢?
使用 Python(当然也可以使用其他语言)可以很好地解决这个问题。
代码如下:
# coding:utf-8
import html
import os
import re
import sys
from html.parser import HTMLParser

# TEXT attribute of a FreeMind <node> tag. Matching on the attribute itself
# (rather than on the text between '" TEXT="' and the end of the line) also
# works when TEXT is the first attribute or is followed by other attributes.
_TEXT_ATTR = re.compile(r'\sTEXT="([^"]*)"')

# Lines that open an embedded HTML fragment (rich node text or a note).
_RICHCONTENT_OPENERS = (
    '<richcontent TYPE="NODE"><html>',
    '<richcontent TYPE="NOTE"><html>',
)

# Tags that carry no text of interest and are simply echoed to stdout.
_ECHOED_TAGS = (
    '<icon ',
    '<arrowlink ',  # arrow links can be used to relate nodes
    '<hook ',
    '<text>',
    '</hook>',
    '<cloud/>',
    '<font ',
    '<edge ',
)


def analyse_mm_file(mm_file):
    """Walk a FreeMind ``.mm`` file line by line and print what it contains.

    The file is expected to have one tag per line, each starting in column 0.
    For every ``<node>`` the decoded TEXT value and the current "flow"
    (the chain of nesting depths, e.g. ``1_2_3``) are printed; embedded
    ``<richcontent>`` HTML is parsed and its paragraphs, list items and image
    sources are printed; a few decoration tags are echoed verbatim.

    Args:
        mm_file: Path of the FreeMind file to read (decoded as UTF-8).

    Returns:
        None. All results are written to stdout.

    Raises:
        SystemExit: If ``mm_file`` is not an existing file.
    """
    if not os.path.isfile(mm_file):
        print("系统中没有找到没有FreeMind文件。{}".format(mm_file))
        sys.exit()

    depth = 0          # current nesting depth; the root node is 1
    flow = ""          # chain of depths leading to the current node
    rich_lines = None  # HTML lines being collected, or None when not inside richcontent

    with open(mm_file, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip('\n')

            if rich_lines is not None:
                # Inside <richcontent>: buffer the HTML in memory until the
                # closing tag, so the enclosing node's children and closing
                # </node> are still processed by the normal branches below.
                if '</richcontent>' in line:
                    _report_rich_content(rich_lines)
                    rich_lines = None
                else:
                    rich_lines.append(line)
                continue

            if line.startswith('</map>'):
                print("解析文件完成!共解析 {} 行。".format(line_no))
            elif line.startswith('</node>'):
                depth -= 1
            elif line.startswith('<node '):
                depth += 1
                text_match = _TEXT_ATTR.search(line)
                if text_match:
                    print("start num = ", text_match.start(1))
                    print(get_chinese(text_match.group(1)))
                if line.endswith('/>'):
                    # A self-closing node has no matching </node> line.
                    depth -= 1
                flow = _extend_flow(flow, depth)
                print("总体的线性流程:", flow)
            elif line.startswith(_RICHCONTENT_OPENERS):
                rich_lines = ['<html>']
            elif line.startswith(_ECHOED_TAGS):
                print(line)


def _report_rich_content(html_lines):
    """Parse buffered richcontent HTML and print the extracted items."""
    data = _parse_html_text("".join(part + "\n" for part in html_lines))
    print("data = ", data)
    for item in data:
        print(item.replace('\n', ''))


def _extend_flow(flow, depth):
    """Return ``flow`` updated so that it ends at ``depth``.

    A deeper node is appended, a node at the same depth leaves the flow
    unchanged, and a shallower node truncates the flow back to that depth.
    """
    if not flow:
        return str(depth)
    parts = flow.split("_")
    last = int(parts[-1])
    if depth == last:
        return flow
    if depth > last:
        return "{}_{}".format(flow, depth)
    # Compare whole tokens rather than substrings so that multi-digit depths
    # and depths absent from the flow are handled correctly.
    kept = []
    for part in parts:
        if int(part) >= depth:
            break
        kept.append(part)
    kept.append(str(depth))
    return "_".join(kept)


def _parse_html_text(page):
    """Feed an HTML string to :class:`HP` and return the collected items."""
    html_parser = HP()
    html_parser.feed(page)
    html_parser.close()
    return html_parser.data


def analyse_html(file_path):
    """Parse the UTF-8 HTML file at ``file_path``.

    Returns:
        list[str]: Text found inside ``<p>``/``<li>`` elements and
        ``"img:<src>"`` entries for images, in document order.
    """
    with open(file=file_path, mode="r", encoding="UTF-8") as f:
        page = f.read()
    return _parse_html_text(page)


def get_chinese(line):
    """Decode the XML/HTML character references in ``line``.

    FreeMind stores non-ASCII text as ``&#x....;`` references, newlines as
    ``&#xa;`` and reserved characters as named entities such as ``&amp;``;
    ``html.unescape`` handles all of them.

    Args:
        line: Raw attribute text.

    Returns:
        str: The decoded text.
    """
    return html.unescape(line)


class HP(HTMLParser):
    """Collect paragraph/list-item text and image sources from HTML.

    After feeding, ``data`` holds the text chunks seen inside ``<p>`` and
    ``<li>`` elements plus one ``"img:<src>"`` entry per image with a
    non-empty ``src``.
    """

    def __init__(self):
        # With convert_charrefs enabled the parser hands handle_data text
        # whose character references are already decoded.
        super().__init__(convert_charrefs=True)
        self.tag_text = False  # True while inside <p> or <li>
        self.data = []

    def handle_starttag(self, tag, attr):
        if tag in ('p', 'li'):
            self.tag_text = True
        if tag == 'img':
            src = self._attr(attr, 'src')
            if src:
                self.data.append("img:{}".format(src))

    def handle_endtag(self, tag):
        if tag in ('p', 'li'):
            self.tag_text = False

    def handle_data(self, data):
        if self.tag_text:
            # Already decoded by the parser; decoding again would corrupt
            # text that legitimately contains an ampersand sequence.
            self.data.append(data)

    def _attr(self, attr_list, attr_name):
        """Return the value of ``attr_name`` in ``attr_list``, or None."""
        for name, value in attr_list:
            if name == attr_name:
                return value
        return None


if __name__ == "__main__":
    analyse_mm_file("./mm/思维导图.mm")
XMind 可以将思维导图导出为 FreeMind 格式,但导出的是 0.8.1 版本的 FreeMind 文件,而且所有内容都挤在同一行中。
所以需要先对 XMind 导出的结果进行简单的修改:
import uuid


def split_xmind_export(text):
    """Put each tag of an XMind-exported FreeMind 0.8.1 document on its own line.

    XMind writes the whole export on a single line; line breaks are inserted
    between adjacent tags so the result can be processed line by line.

    Args:
        text: Full content of the exported ``.mm`` file.

    Returns:
        str: The re-broken document, or an empty string when the content is
        not a 0.8.1 export (a 1.0.1 file needs no fix-up).
    """
    if '<map version="1.0.1">' in text or '<map version="0.8.1">' not in text:
        return ""
    # Work on the text itself rather than on the repr of a list of lines,
    # which would escape quotes/backslashes and leave literal "\n" sequences.
    return (
        text.replace('"><', '">\n<')
        .replace('></', '>\n</')
        .replace('</node><node', '</node>\n<node')
        .replace('"/><node', '"/>\n<node')
    )


def main(file_path=r'/path/to/mm/file.mm'):
    """Convert ``file_path`` and write the result to a uniquely named file.

    The converted text is written to ``./temp-<uuid>.txt`` and also printed.
    """
    with open(file=file_path, mode='r', encoding='utf-8') as f:
        content = f.read()
    get_str = split_xmind_export(content)

    file_name = r'./temp-' + uuid.uuid1().hex + '.txt'
    with open(file=file_name, mode='a', encoding='UTF-8') as f:
        print(get_str, file=f, flush=True)
    print(get_str)


if __name__ == '__main__':
    main()
生活和学习都是层层递进的:在经历了痛苦的逐行解析之后,发现可以直接用解析 XML 文件的方式得到最终结果。
少啰嗦,上代码:
import zipfile
import os
import io
import sys
import hashlib
import xml.etree.ElementTree as ET


class AnalyseMindMap:
    """Convert an XMind or FreeMind mind map into wiki-style text.

    The central topic becomes an ``h1.`` heading, its direct children become
    ``h3.`` headings and deeper levels become lines prefixed with ``mark``
    repeated once per extra level. Notes are wrapped in ``<pre>`` blocks.
    The generated text is accumulated in ``context`` and also printed.
    """

    def __init__(self, file_path, mark):
        """Store the source path and the list marker.

        Args:
            file_path: Path of the ``.xmind`` or ``.mm`` file.
            mark: Marker repeated for nested levels (e.g. ``"#"`` or ``"*"``);
                an empty value means a single space.
        """
        self.file_path = file_path
        self.context = ""
        # Any non-empty marker is accepted so that self.mark is always set.
        self.mark = mark if mark else " "

    @staticmethod
    def _md5_of_file(path):
        """Return the hex MD5 digest of the file at ``path``."""
        digest = hashlib.md5()
        with open(path, 'rb') as stream:
            for chunk in iter(lambda: stream.read(1024), b''):
                digest.update(chunk)
        return digest.hexdigest()

    @staticmethod
    def _load_root(xml_file):
        """Parse ``xml_file`` and return its root element; exit on failure."""
        try:
            return ET.parse(xml_file).getroot()
        except (ET.ParseError, OSError):
            print("parse test.xml fail!")
            sys.exit()

    @staticmethod
    def _text_of(parent, path):
        """Return the text of the first ``path`` match under ``parent``.

        Returns None when nothing matches and ``""`` when the matched
        element has no text.
        """
        found = parent.find(path)
        if found is None:
            return None
        return found.text or ""

    @staticmethod
    def _squeeze(text):
        """Drop the spaces and newlines FreeMind adds around note text.

        NOTE: this also removes spaces inside the text itself.
        """
        return (text or "").replace(' ', '').replace('\n', '')

    # Parse an XMind file
    def analyse_xmind(self):
        """Unpack the ``.xmind`` archive and convert its ``content.xml``.

        The archive is extracted into a directory named after the file's MD5
        digest next to the source file; an existing directory is reused.

        Returns:
            str: The generated text, or ``"<name> 不存在"`` when the file is
            missing.
        """
        file_name = os.path.basename(self.file_path)
        if not os.path.isfile(self.file_path):
            return "{} 不存在".format(file_name)

        # dirname() is empty for a bare file name; fall back to the cwd.
        base_dir = os.path.dirname(self.file_path) or "."
        md5value = self._md5_of_file(self.file_path)
        target_dir = os.path.join(base_dir, md5value)

        if os.path.isdir(target_dir):
            print('已经存在了该文件', md5value)
        else:
            with zipfile.ZipFile(self.file_path, 'r') as file_zip:
                file_zip.extractall(target_dir)

        return self.analyse_xml(os.path.join(target_dir, 'content.xml'))

    # Parse the XMind content.xml file
    def analyse_xml(self, xml_file):
        """Convert an extracted XMind ``content.xml`` file.

        Args:
            xml_file: Path of the XML file.

        Returns:
            str: The generated text (also stored in ``context``).

        Raises:
            SystemExit: If the file cannot be read or parsed.
        """
        root = self._load_root(xml_file)

        # ElementTree spells qualified names as "{namespace}tag".
        if root.tag.startswith('{'):
            pre_tag = root.tag[:root.tag.index('}') + 1]
        else:
            pre_tag = ''
        topic_path = pre_tag + 'sheet/' + pre_tag + 'topic/'

        title = self._text_of(root, topic_path + pre_tag + 'title') or ""
        print("h1.", title)
        self.context = "\nh1. " + title + "\n"

        note = self._text_of(
            root, topic_path + pre_tag + 'notes/' + pre_tag + 'plain')
        if note is not None:
            print("<pre>备注:" + note + "</pre>")
            self.context += "<pre>备注:" + note + "</pre>" + "\n\n"

        # Level 1 is the layer directly below the central topic.
        for first_topic in root.findall(topic_path + pre_tag + 'children'):
            self.recursive_xml(first_topic, pre_tag, 1)
        return self.context

    # Recursively collect topic values
    def recursive_xml(self, root, pre_tag, num):
        """Append every topic below a ``children`` element to ``context``.

        Args:
            root: A ``children`` element.
            pre_tag: Namespace prefix in ``{uri}`` form (may be empty).
            num: Nesting level of the topics directly below ``root``.
        """
        title_path = pre_tag + 'title'
        plain_path = pre_tag + 'notes/' + pre_tag + 'plain'
        label_path = pre_tag + 'labels/' + pre_tag + 'label'

        # One children element may hold several topics groups.
        for topics in root.findall(pre_tag + 'topics'):
            for topic in topics.findall(pre_tag + 'topic'):
                title = self._text_of(topic, title_path) or ""
                if num > 1:
                    print(self.mark * (num - 1), title)
                    self.context += self.mark * (num - 1) + " " + title + "\n"
                else:
                    print("h3.", title)
                    self.context += "\nh3. " + title + '\n\n'

                note = self._text_of(topic, plain_path)
                if note is not None:
                    print("<pre>备注:" + note + "</pre>")
                    self.context += "<pre>备注:" + note + "</pre>" + "\n\n"

                label = self._text_of(topic, label_path)
                if label is not None:
                    print("-->标签:", label + "<--")
                    self.context += "-->标签:" + label + "<--" + "\n\n"

                for new_topic in topic.findall(pre_tag + 'children'):
                    self.recursive_xml(new_topic, pre_tag, num + 1)

    # Parse a FreeMind XML file
    def analyse_mm_xml(self):
        """Convert the FreeMind ``.mm`` file at ``file_path``.

        Returns:
            str: The generated text, or ``"<name> 不存在"`` when the file is
            missing.

        Raises:
            SystemExit: If the file cannot be read or parsed.
        """
        if not os.path.isfile(self.file_path):
            return "{} 不存在".format(os.path.basename(self.file_path))

        root = self._load_root(self.file_path)
        # Start from a clean buffer so repeated calls do not duplicate output.
        self.context = ""
        for node in root.findall("node"):
            title = node.attrib.get('TEXT', '')
            print("h1.", title)
            self.context += "\nh1. " + title + "\n"
            if node.find('richcontent') is not None:
                note = self._squeeze(
                    self._text_of(node, 'richcontent/html/body/p'))
                print('<pre>备注:', note, '</pre>')
                self.context += '<pre>备注:' + note + '</pre>' + "\n\n"
            self.recursive_node(node, 1)
        return self.context

    # Recursively walk the FreeMind nodes
    def recursive_node(self, root, num):
        """Append every ``node`` below ``root`` to ``context``.

        Args:
            root: The parent ``node`` element.
            num: Nesting level of the nodes directly below ``root``.
        """
        for node in root.findall('node'):
            if 'TEXT' in node.attrib:
                if num > 1:
                    print(self.mark * (num - 1), node.attrib['TEXT'])
                    self.context += (
                        self.mark * (num - 1) + ' ' + node.attrib['TEXT'] + "\n")
                else:
                    print("h3.", node.attrib['TEXT'])
                    self.context += "\nh3. " + node.attrib['TEXT'] + "\n\n"
            if node.find('richcontent') is not None:
                context_out = ''.join(
                    self._squeeze(p.text) + '\n'
                    for p in node.findall('richcontent/html/body/p')
                )
                print('<pre>备注:', context_out, '</pre>')
                self.context += '<pre>备注:' + context_out + '</pre>' + "\n\n"
            if node.find('node') is not None:
                self.recursive_node(node, num + 1)


if __name__ == '__main__':
    file_path = r'C:\path\to\file.xmind'
    amm = AnalyseMindMap(file_path, "#")
    amm.analyse_xmind()
    file_path = r'C:\path\to\file.mm'
    amm = AnalyseMindMap(file_path, "#")
    amm.analyse_mm_xml()
================我是底线================