【问题标题】:Jupyter Kernel dies/Spyder console stops while training custom NER model in Spacy 2.0.11(在 Spacy 2.0.11 中训练自定义 NER 模型时,Jupyter 内核崩溃/Spyder 控制台停止)
【发布时间】:2020-07-24 22:32:14
【问题描述】:

我试图在 spacy 中训练一个自定义 NER 模型。最初我安装了最新的 spacy 版本,但在训练过程中出现以下错误

ValueError: [E103] Trying to set conflicting doc.ents: 一个 token 只能属于一个实体,因此请确保您设置的实体互不重叠。

之后我安装了 spacy==2.0.11 并尝试运行我的代码。当只有大约 10 行训练数据时,模型运行良好,并且能保存到我的输出目录中。但是当原始训练数据更多(约 5K 行)时,我的 Jupyter 内核就会崩溃;在 Spyder 中运行时,控制台会直接退出!!

我了解旧版(已弃用的)spacy 不会抛出这个 ValueError,但这仍然没有用,因为我还是无法训练我的模型。

样本数据:

CarryBag    09038820815c.txt
Stopperneedle   0903882080f4.txt
Foilbags    09038820819.txt

我有大约 700 个这样的文件,其中包含要标记的数据,并且在每个文件中都有多个实体需要标记。 参考代码:

import spacy
# import en_core_web_sm
import re
import csv
from spacy.matcher import PhraseMatcher
import plac
from pathlib import Path
import random

# Convert a PhraseMatcher match into character offsets for NER training data
def str_index_conversion(lbl, doc, matchitem):
    """Return ``(start_char, end_char, lbl)`` for one PhraseMatcher match.

    Args:
        lbl: Entity label to attach to the span (e.g. ``'PRODUCT'``).
        doc: The document the match was found in. Slicing it by token index
            must yield a span exposing ``start_char`` and ``end_char``.
        matchitem: A ``(match_id, start_token, end_token)`` tuple as
            returned by the matcher.

    Returns:
        A ``(start_char, end_char, lbl)`` tuple of character offsets into
        the document text.
    """
    # Read the offsets from the matched span itself. Measuring the text of
    # doc[0:start] under-counts, because a span's text leaves out the
    # whitespace that follows its last token, shifting both offsets left.
    span = doc[matchitem[1]:matchitem[2]]
    return (span.start_char, span.end_char, lbl)

# Load the small English model.
nlp = spacy.load('en_core_web_sm')

# Reuse the pipeline's entity recognizer when it has one; otherwise create
# a fresh one and register it on the pipeline.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)

# Register the custom entity label on the recognizer.
ner.add_label('PRODUCT')

# Base directory that document file names are appended to.
DIR = 'D:/Docs/'
# Phrase matcher built on the model's vocabulary.
matcher = PhraseMatcher(nlp.vocab)


def _clean_text(raw_text):
    """Strip URLs, punctuation and digits from ``raw_text`` and lower-case it."""
    # Drop URL lines first, while line boundaries still exist. Doing this
    # after whitespace has been collapsed would turn the text into a single
    # line and erase all of it whenever it starts with "http".
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', raw_text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"[-\"#/@;:<>?{}*`• ?+=~|$.!‘?“”?,_]", " ", text)
    text = re.sub(r'\d+', '', text)  # removing all numbers
    text = re.sub(' +', ' ', text)
    return text.lower()


def _drop_overlapping_matches(matches):
    """Keep the longest non-overlapping ``(match_id, start, end)`` matches.

    A token may belong to only one entity, so when matches share tokens
    (e.g. "blister" inside "blister pack", or the same phrase matched twice)
    the longest one wins. The result is ordered by start token.
    """
    kept = []
    taken_tokens = set()
    for match in sorted(matches, key=lambda m: (m[1] - m[2], m[1])):
        token_range = range(match[1], match[2])
        if taken_tokens.isdisjoint(token_range):
            kept.append(match)
            taken_tokens.update(token_range)
    return sorted(kept, key=lambda m: m[1])


list_str_index = []
to_train_ents = []
with open(r'D:\ner_dummy_pack.csv', newline='', encoding='utf-8') as myFile:

    reader = csv.reader(myFile)
    # Each row is expected to hold: product name(s), document file name.
    for row in reader:
        try:
            product = row[0].lower()
            filename = row[1]
            # Close the document as soon as it has been read.
            with open(DIR + filename, "r", encoding='utf-8') as file:
                print(file)
                filecontents = file.read()
            if not filecontents:
                # Empty documents contribute no training example.
                continue
            filecontents = _clean_text(filecontents)

            # A product cell may list several comma-separated names.
            # NOTE(review): patterns are added to the same matcher for every
            # row, so later documents are presumably also matched against
            # products from earlier rows -- confirm this is intended.
            for elem in product.split(','):
                elem = elem.strip()
                if elem:
                    matcher.add('PRODUCT', None, nlp(elem))

            print(filecontents)
            doc = nlp(filecontents)
            matches = _drop_overlapping_matches(matcher(doc))
            list_str_index = [str_index_conversion('PRODUCT', doc, x) for x in matches]
            to_train_ents.append((filecontents, dict(entities=list_str_index)))
        except Exception as e:
            # Best effort: report the failing row's error and carry on with
            # the remaining rows.
            print(e)

to_train_entsfinal = to_train_ents




def main(model=None, output_dir=None, n_iter=100):
    """Train the NER component on ``to_train_entsfinal`` and save the pipeline.

    Args:
        model: Not used by this function; kept for interface compatibility.
        output_dir: Directory the pipeline is written to. A hard-coded
            default path is used when this is ``None``.
        n_iter: Not used by this function; the training loop runs a fixed
            10 passes. NOTE(review): confirm whether ``n_iter`` was meant to
            control the number of passes.
    """
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    with nlp.disable_pipes(*other_pipes):  # only train NER
        # Start training while the other components are disabled, so that
        # they are left untouched by the initialisation.
        optimizer = nlp.begin_training()
        for itn in range(10):
            losses = {}
            random.shuffle(to_train_entsfinal)
            # One (text, annotations) example per update call.
            for item in to_train_entsfinal:
                nlp.update([item[0]],
                           [item[1]],
                           sgd=optimizer,
                           drop=0.50,
                           losses=losses)
            print(losses)
            print("OUTTTTT")

    if output_dir is None:
        output_dir = "C:\\Users\\APRIL"

    # Create the target directory, including missing parents; an already
    # existing directory is fine.
    noutput_dir = Path(output_dir)
    noutput_dir.mkdir(parents=True, exist_ok=True)

    nlp.to_disk(output_dir)


if __name__ == '__main__':
    main()

谁能帮我解决这个问题?即使我在 10 多行的样本中删除了相互冲突的实体,例如:

Blister       abc.txt
Blisterpack   abc.txt
Blisters      abc.txt   

同样的问题仍然出现,模型无法训练

建议的更改:

def main(model=None, output_dir=None, n_iter=100):
    """Train the NER component on ``to_train_entsfinal`` and save the pipeline.

    Args:
        model: Not used by this function; kept for interface compatibility.
        output_dir: Directory the pipeline is written to. A hard-coded
            default path is used when this is ``None``.
        n_iter: Not used by this function; the training loop runs a fixed
            10 passes. NOTE(review): confirm whether ``n_iter`` was meant to
            control the number of passes.
    """
    top_memory_precentage_use = 75 # or what ever number you choose

    # NOTE(review): neither helper below is called anywhere in this
    # function, so defining them does not change what the training does.
    def handle_memory(ruler):
        # NOTE(review): this branch runs while memory usage is *below* the
        # threshold; a memory guard would presumably act above it -- confirm
        # whether `<` should be `>`.
        if psutil.virtual_memory().percent < top_memory_precentage_use:
            dump_ruler_nonascii(ruler)
            ruler = nlp.begin_training() #or just init the nlp object again
        return ruler

    # This fitted for my use case
    def dump_ruler_nonascii(ruler):
        # NOTE(review): `self` is not defined in this scope, and no import of
        # `psutil`, `os` or `json` is visible alongside this code; calling
        # the helper as written would fail -- confirm the intended path.
        path = Path(os.path.join(self.data_path, 'config.jsonl'))
        pattern = ruler.patterns
        with open(path, "a", encoding="utf-8") as f:
            for line in pattern:
                f.write(json.dumps(line, ensure_ascii=False) + "\n")
        return ruler

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    with nlp.disable_pipes(*other_pipes):  # only train NER
        # Start training while the other components are disabled, so that
        # they are left untouched by the initialisation.
        optimizer = nlp.begin_training()
        for itn in range(10):
            losses = {}
            random.shuffle(to_train_entsfinal)
            # One (text, annotations) example per update call.
            for item in to_train_entsfinal:
                nlp.update([item[0]],
                           [item[1]],
                           sgd=optimizer,
                           drop=0.50,
                           losses=losses)
            print(losses)
            print("OUTTTTT")

    if output_dir is None:
        output_dir = "C:\\Users\\APRIL"

    # Create the target directory, including missing parents; an already
    # existing directory is fine.
    noutput_dir = Path(output_dir)
    noutput_dir.mkdir(parents=True, exist_ok=True)

    nlp.to_disk(output_dir)


if __name__ == '__main__':
    main()

【问题讨论】:

    标签: python-3.x machine-learning nlp spacy valueerror


    【解决方案1】:

    很难说清为什么会发生这种情况,但我可以给你提供 2 个可在训练循环中使用的辅助函数,你可以根据自己的使用场景进行调整。就我而言,我是在写入模式(patterns),并且在每次迭代时检查内存使用情况。

    #add the following imports
    import psutil
    import os
    
    
    # Threshold that handle_memory compares psutil's memory-usage percentage against.
    top_memory_precentage_use = 75 # or what ever number you choose
    
    def handle_memory(ruler):
        """Conditionally dump ``ruler`` and replace it, based on memory usage.

        Compares ``psutil.virtual_memory().percent`` with
        ``top_memory_precentage_use``. When the condition holds, ``ruler`` is
        passed to ``dump_ruler_nonascii`` and then replaced by the result of
        ``nlp.begin_training()``. The (possibly replaced) object is returned.
        """
        # NOTE(review): this branch runs while usage is *below* the
        # threshold; a memory guard would presumably act above it -- confirm
        # whether `<` should be `>`.
        if psutil.virtual_memory().percent < top_memory_precentage_use:
            dump_ruler_nonascii(ruler)
            ruler = nlp.begin_training() #or just init the nlp object again
        return ruler
    
    # This fitted for my use case
    def dump_ruler_nonascii(ruler):
        """Append each entry of ``ruler.patterns`` to ``config.jsonl`` as one JSON line.

        The file is opened in append mode with UTF-8 encoding and non-ASCII
        characters are written unescaped (``ensure_ascii=False``).
        """
        # NOTE(review): `self` is not defined in this function, so it looks
        # lifted from a class; no `import json` is visible in the snippet
        # either -- confirm where `data_path` should come from.
        path = Path(os.path.join(self.data_path, 'config.jsonl'))
        pattern = ruler.patterns
        with open(path, "a", encoding="utf-8") as f:
            for line in pattern:
                f.write(json.dumps(line, ensure_ascii=False) + "\n")
    

    【讨论】:

    • 嗨 @Green,感谢您的回复。你能告诉我应该在代码的哪一部分使用这些函数吗?
    • 在我的训练循环中写的
    • 我已经更新了建议更改下的更改:在我的问题中。你能否让我知道它是否是正确的实现。运行带有更改的代码后,我仍然面临同样的问题
    • 你在哪里调用了这个函数?好像你只是写了函数并没有调用它。
    • 我已更改代码并在问题部分进行了更新,但同样的问题仍然存在。你能帮我理解你的代码在做什么吗?以及这如何帮助您解决问题
    猜你喜欢
    • 1970-01-01
    • 2022-06-10
    • 1970-01-01
    • 2020-08-06
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2020-10-16
    • 1970-01-01
    相关资源
    最近更新 更多