python解析FreeMind和XMind思维导图

记录瞬间

在实际工作中，我们经常用思维导图做分析和设计。但设计完成之后，如果想把思维导图的内容转换成文字输出，该怎么做呢？

使用 Python（当然也可以使用其他语言）可以很好地解决这个问题。

代码如下：

# coding:utf-8
import os
from html.parser import HTMLParser


def analyse_mm_file(mm_file):
    """Scan a FreeMind ``.mm`` file line by line and print what it finds.

    The file is processed as plain text, not parsed as XML: a tag is only
    recognised when it starts at column 0 of its line.  Node texts, the parsed
    content of embedded rich-text sections, a few decoration tags and the
    running "flow" of node depths are printed to stdout; nothing is returned.

    While a rich-text section is being read, its lines are appended to
    ``temp.html`` in the current working directory.  After the next
    ``</node>`` line that file is parsed with ``analyse_html`` and removed.

    Args:
        mm_file: Path of the FreeMind file; it is read as UTF-8.

    Raises:
        SystemExit: If ``mm_file`` is not an existing file.
    """
    if not os.path.isfile(mm_file):
        print("系统中没有找到没有FreeMind文件。{}".format(mm_file))
        raise SystemExit

    temp_html = "./temp.html"
    # Tags that are simply echoed to stdout.
    echoed_tags = ('<icon ', '<arrowlink ', '<hook ', '<text>',
                   '</hook>', '<cloud/>', '<font ', '<edge ')

    num = 1         # running line counter
    point = 0       # current node depth (1 is the root node)
    mark_node = 0   # 0 idle, 1 collecting a node's HTML body, 2 ready to parse
    mark_note = 0   # 0 idle, 1 collecting a note's HTML body, 2 ready to parse
    flow = ""       # depths visited so far, joined with "_"

    with open(mm_file, encoding="UTF-8") as f:
        lines = f.readlines()

    for line in lines:
        line = line.rstrip('\n')

        if mark_node == 1 or mark_note == 1:
            # Inside a rich-text section: copy its lines into the temp file.
            # Opening in append mode also guarantees the file exists when it
            # is parsed later, even if nothing was written.
            with open(temp_html, mode="a", encoding="UTF-8") as tmp:
                if '<richcontent TYPE="NODE"><html>' in line or \
                        '<richcontent TYPE="NOTE"><html>' in line:
                    tmp.write("<html>\n")
                elif '</html>' in line:
                    tmp.write("</html>\n")
                elif '</richcontent>' in line:
                    pass
                elif '</node>' in line:
                    point -= 1
                    if mark_node == 1:
                        mark_node = 2
                    if mark_note == 1:
                        mark_note = 2
                else:
                    tmp.write(line + "\n")
            num += 1
            continue

        if mark_node == 2 or mark_note == 2:
            data = analyse_html(temp_html)
            print("data = ", data)
            os.remove(temp_html)
            if mark_node == 2:
                # Text that belongs to the node itself.
                for item in data:
                    print(item.replace('\n', ''))
            if mark_note == 2:
                # Text that belongs to the node's note.
                for item in data:
                    print(item.replace('\n', ''))
            mark_node = 0
            mark_note = 0

        # NOTE(review): the header line is counted here and once more by the
        # final ``else`` of the chain below, so ``num`` ends up one too high.
        if line.rfind('<map version="1.0.1">') == 0 and num == 1:
            num += 1

        # ``rfind(tag) == 0`` is kept on purpose: it only matches when the
        # tag's last occurrence is at column 0.
        if line.rfind('</map>') == 0:
            print("解析文件完成！共解析 {} 行。".format(num))
        elif line.rfind('</node>') == 0:
            point -= 1
            num += 1
        elif line.rfind('<node ') == 0:
            point += 1
            text_at = line.rfind('" TEXT="')
            if text_at != -1 and line[-2:] == '">':
                # Node with children: strip the trailing '">'.
                start_num = text_at + 8
                print("start num = ", start_num)
                get_value = get_chinese(line[start_num: len(line) - 2])
                print(get_value)
            elif text_at != -1 and line[-2:] == '/>':
                # Self-closing leaf node: strip the trailing '"/>'.
                start_num = text_at + 8
                print("start num = ", start_num)
                get_value = get_chinese(line[start_num: len(line) - 3])
                print(get_value)
                point -= 1
            if text_at == -1:
                mark_node = 1                    # the node text is an HTML body
            num += 1

            if len(flow) == 0:
                flow = "{}".format(point)
            else:
                last_depth = int(flow.split("_")[-1])
                if point < last_depth:
                    flow = flow.split(str(point))[0] + str(point)
                elif point > last_depth:
                    flow = "{}_{}".format(flow, point)
            print("总体的线性流程：", flow)

        elif line.rfind('<richcontent TYPE="NOTE"><html>') == 0:
            with open(temp_html, mode="a", encoding="UTF-8") as tmp:
                tmp.write('<html>\n')
            mark_note = 1                        # a note body follows
        elif any(line.rfind(tag) == 0 for tag in echoed_tags):
            print(line)
            num += 1
        else:
            num += 1


def analyse_html(file_path):
    """Run the ``HP`` parser over an HTML file and return what it collected.

    Args:
        file_path: Path of the HTML file; it is read as UTF-8.

    Returns:
        The parser's ``data`` list after the whole file has been fed to it.
    """
    with open(file=file_path, mode="r", encoding="UTF-8") as html_file:
        markup = html_file.read()

    collector = HP()
    collector.feed(markup)
    collector.close()
    return collector.data


def get_chinese(line):
    """Decode the character references contained in *line*.

    Numeric references such as ``&#x4e2d;`` or ``&#xa;`` and named references
    such as ``&amp;`` are replaced by the characters they stand for; all other
    text is returned unchanged.

    The previous hand-written decoder split the string on ``&#x`` and guessed
    from character positions.  It raised on input like ``"test;&#x4e2d;"``,
    left ``&amp;`` undecoded whenever a ``&#x`` reference was present, and
    mangled references that were not exactly four hex digits long.  The
    standard-library decoder handles all of these cases.

    Args:
        line: Text that may contain character references.

    Returns:
        The decoded text.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from html import unescape

    return unescape(line)


class HP(HTMLParser):
    """HTML parser that collects paragraph / list-item text and image sources.

    After feeding a document, ``data`` holds, in document order:

    * every text chunk found while inside a ``<p>`` or ``<li>`` element,
      passed through ``get_chinese``;
    * ``"img:<src>"`` for every ``<img>`` tag with a non-empty ``src``.
    """

    def __init__(self):
        super().__init__()
        self.tag_text = False   # True while the parser is inside <p> or <li>
        self.data = []          # collected text chunks and image markers

    def handle_starttag(self, tag, attr):
        """Start collecting text on <p>/<li>; record the source of <img>."""
        if tag in ('p', 'li'):
            self.tag_text = True
        if tag == 'img':
            src = self._attr(attr, 'src')
            if src:
                self.data.append("img:{}".format(src))

    def handle_endtag(self, tag):
        """Stop collecting text when a <p> or <li> element closes."""
        if tag in ('p', 'li'):
            self.tag_text = False

    def handle_data(self, data):
        """Keep the text chunk if it sits inside a <p> or <li> element."""
        if self.tag_text:
            self.data.append(get_chinese(data))

    def _attr(self, attr_list, attr_name):
        """Return the value of *attr_name* in *attr_list*, or None if absent."""
        for name, value in attr_list:
            if name == attr_name:
                return value
        return None


# Example run: analyse the sample FreeMind file, looked up relative to the
# current working directory.
analyse_mm_file("./mm/思维导图.mm")

XMind 可以通过“导出”功能把思维导图保存为 FreeMind 格式，但导出的是 0.8.1 版本的 FreeMind 文件，而且所有内容都写在同一行中。

所以需要对xmind导出的结果进行简要的修改

import uuid

# Path of the FreeMind file exported from XMind.
file_path = r'/path/to/mm/file.mm'
with open(file=file_path, mode='r', encoding='utf-8') as f:
    content = f.readlines()

# Only the first line decides how the file is treated.  A 0.8.1 export keeps
# everything on one line, so a line break is inserted between adjacent tags.
# Any other header (including 1.0.1) or an empty file yields an empty result
# instead of leaving ``get_str`` undefined.
get_str = ""
if content and '<map version="0.8.1">' in content[0]:
    # Join the real text rather than editing the repr of the list, which left
    # literal "\n" sequences behind and broke on lines containing quotes.
    get_str = (
        "".join(content)
        .replace('"><', '">\n<')
        .replace('></', '>\n</')
        .replace('</node><node', '</node>\n<node')
        .replace('"/><node', '"/>\n<node')
    )

# Write the result to a uniquely named text file and echo it to stdout.
file_name = r'./temp-' + uuid.uuid1().hex + '.txt'
with open(file=file_name, mode='a', encoding='UTF-8') as f:
    print(get_str, file=f, flush=True)

print(get_str)

生活和学习都是层层递进的。在经历了痛苦的逐行解析之后，我发现可以直接用解析 XML 文件的方式得到最终结果。

少啰嗦，上代码：

import zipfile
import os
import io
import sys
import hashlib
import xml.etree.ElementTree as ET


class AnalyseMindMap:
    """Convert an XMind workbook or a FreeMind map into outline text.

    The root topic is written as an ``h1.`` line, its direct children as
    ``h3.`` lines, and deeper topics as lines prefixed with ``mark`` repeated
    once per additional level.  Notes are wrapped in ``<pre>备注：...</pre>``
    and XMind labels in ``-->标签：...<--``.  Everything appended to the result
    is also printed to stdout.
    """

    def __init__(self, file_path, mark):
        """Remember the input file and the prefix used for nested topics.

        Args:
            file_path: Path of the ``.xmind`` or ``.mm`` file.
            mark: Prefix repeated per nesting level.  An empty value selects
                four spaces; any other string (e.g. ``"#"``, ``"*"``) is used
                as given.
        """
        self.file_path = file_path
        self.context = ""
        # Previously any mark other than "", "#" or "*" left the attribute
        # unset and failed later with AttributeError.
        self.mark = mark if mark else "    "

    @staticmethod
    def _text(element):
        """Return the element's text, or "" when the element or text is missing."""
        if element is None or element.text is None:
            return ""
        return element.text

    @staticmethod
    def _squash(text):
        """Remove all spaces and line breaks from *text*."""
        return text.replace(' ', '').replace('\n', '')

    # Parse an XMind workbook.
    def analyse_xmind(self):
        """Unpack the XMind archive next to itself and convert ``content.xml``.

        The archive is extracted into a directory named after the MD5 digest
        of the file, located beside the file.  If that directory already
        holds ``content.xml`` the extraction is skipped.

        Returns:
            The outline text, or ``"<file name> 不存在"`` when the file is missing.
        """
        file_name = os.path.basename(self.file_path)
        if not os.path.isfile(self.file_path):
            return "{} 不存在".format(file_name)

        # A bare file name has an empty dirname; fall back to the current dir.
        base_dir = os.path.dirname(self.file_path) or "."

        digest = hashlib.md5()
        with open(self.file_path, 'rb') as archive:
            for chunk in iter(lambda: archive.read(1024), b''):
                digest.update(chunk)
        md5value = digest.hexdigest()

        extract_dir = os.path.join(base_dir, md5value)
        xml_file = os.path.join(extract_dir, 'content.xml')
        if md5value in os.listdir(base_dir):
            print('已经存在了该文件', md5value)
        if not os.path.isfile(xml_file):
            with zipfile.ZipFile(self.file_path, 'r') as file_zip:
                file_zip.extractall(extract_dir)
        return self.analyse_xml(xml_file)

    # Parse the XML file taken from an XMind archive.
    def analyse_xml(self, xml_file):
        """Convert an XMind ``content.xml`` file into outline text.

        Args:
            xml_file: Path of the XML file.

        Returns:
            The outline text; it is also stored in ``self.context``,
            replacing any earlier value.

        Raises:
            SystemExit: If the file cannot be read or parsed.
        """
        try:
            root = ET.parse(xml_file).getroot()
        except Exception:  # any read/parse failure ends the program, as before
            print("parse test.xml fail!")
            sys.exit()

        # "{namespace}" prefix of the root tag; empty when there is no namespace.
        tag = root.tag
        pre_tag = tag[:tag.index('}') + 1] if tag.startswith('{') else ''
        root_topic = pre_tag + 'sheet/' + pre_tag + 'topic/'

        title = self._text(root.find(root_topic + pre_tag + 'title'))
        print("h1.", title)
        self.context = "\nh1. " + title + "\n"

        note = root.find(root_topic + pre_tag + 'notes/' + pre_tag + 'plain')
        if note is not None:
            note_line = "<pre>备注：" + self._text(note) + "</pre>"
            print(note_line)
            self.context += note_line + "\n\n"

        # Depth counter starts at 1 for the children of the root topic.
        for children in root.findall(root_topic + pre_tag + 'children'):
            self.recursive_xml(children, pre_tag, 1)

        return self.context

    # Collect topic values recursively.
    def recursive_xml(self, root, pre_tag, num):
        """Append every topic below an XMind ``children`` element.

        Args:
            root: A ``children`` element.
            pre_tag: The ``{namespace}`` prefix of all tags.
            num: Depth of the topics directly below ``root`` (1 = ``h3.`` level).
        """
        # One ``children`` element may hold several ``topics`` groups.
        for topics in root.findall(pre_tag + 'topics'):
            for topic in topics.findall(pre_tag + 'topic'):
                title = self._text(topic.find(pre_tag + 'title'))
                if num > 1:
                    print(self.mark * (num - 1), title)
                    self.context += self.mark * (num - 1) + " " + title + "\n"
                else:
                    print("h3.", title)
                    self.context += "\nh3. " + title + '\n\n'

                note = topic.find(pre_tag + 'notes/' + pre_tag + 'plain')
                if note is not None:
                    note_line = "<pre>备注：" + self._text(note) + "</pre>"
                    print(note_line)
                    self.context += note_line + "\n\n"

                label = topic.find(pre_tag + 'labels/' + pre_tag + 'label')
                if label is not None:
                    label_text = self._text(label)
                    print("-->标签：", label_text + "<--")
                    self.context += "-->标签：" + label_text + "<--" + "\n\n"

                for children in topic.findall(pre_tag + 'children'):
                    self.recursive_xml(children, pre_tag, num + 1)

    # Parse a FreeMind XML file.
    def analyse_mm_xml(self):
        """Convert the FreeMind file at ``self.file_path`` into outline text.

        Returns:
            ``self.context`` with the new text appended, or ``None`` when the
            file does not exist.

        Raises:
            SystemExit: If the file cannot be read or parsed.
        """
        if not os.path.isfile(self.file_path):
            return None

        try:
            root = ET.parse(self.file_path).getroot()
        except Exception:  # any read/parse failure ends the program, as before
            print("parse test.xml fail!")
            sys.exit()

        for node in root.findall("node"):
            title = node.get('TEXT', '')
            print("h1.", title)
            self.context += "\nh1. " + title + "\n"
            if node.find('richcontent') is not None:
                # Only the first paragraph is used for the root node.
                note = self._squash(self._text(node.find('richcontent/html/body/p')))
                print('<pre>备注：', note, '</pre>')
                self.context += '<pre>备注：' + note + '</pre>' + "\n\n"
            self.recursive_node(node, 1)
        return self.context

    # Walk the FreeMind nodes recursively.
    def recursive_node(self, root, num):
        """Append every ``node`` element below *root*.

        Args:
            root: The parent ``node`` element.
            num: Depth of the nodes directly below ``root`` (1 = ``h3.`` level).
        """
        for node in root.findall('node'):
            if 'TEXT' in node.attrib:
                if num > 1:
                    print(self.mark * (num - 1), node.attrib['TEXT'])
                    self.context += self.mark * (num - 1) + ' ' + node.attrib['TEXT'] + "\n"
                else:
                    print("h3.", node.attrib['TEXT'])
                    self.context += "\nh3. " + node.attrib['TEXT'] + "\n\n"
            if node.find('richcontent') is not None:
                # One output line per paragraph; empty paragraphs are kept.
                context_out = ''.join(
                    self._squash(self._text(p)) + '\n'
                    for p in node.findall('richcontent/html/body/p')
                )
                print('<pre>备注：', context_out, '</pre>')
                self.context += '<pre>备注：' + context_out + '</pre>' + "\n\n"
            self.recursive_node(node, num + 1)


if __name__ == '__main__':
    # Convert an XMind workbook (its archive is unpacked next to the file).
    file_path = r'C:\path\to\file.xmind'
    amm = AnalyseMindMap(file_path, "#")
    amm.analyse_xmind()
    # Convert a FreeMind map by parsing it directly as XML.
    file_path = r'C:\path\to\file.mm'
    amm = AnalyseMindMap(file_path, "#")
    amm.analyse_mm_xml()

================我是底线================