【问题标题】:Error while doing multi-class classification in spacy(在 spaCy 中进行多类分类时出错)
【发布时间】:2018-08-05 00:29:48
【问题描述】:

我正在尝试进行多类分类,使用的是一个众包的文本分类数据集(dataset)。下面是我的代码:

from __future__ import unicode_literals, print_function
from __future__ import unicode_literals

from pathlib import Path

import pandas as pd
import spacy
from spacy.util import minibatch, compounding




def main(model=None, output_dir=None, n_iter=20):
    """Train a spaCy text classifier on 'text_emotion.csv' and try it out.

    model: name/path handed to ``spacy.load``; when None a blank 'en'
        pipeline is created instead.
    output_dir: if not None, the trained pipeline is written there with
        ``nlp.to_disk`` and then loaded back as a sanity check.
    n_iter: number of passes made over ``train_data``.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # register every emotion label with the text classifier
    for i in ['neutral','worry','happiness','sadness','love','surprise','fun','relief','hate','enthusiasm','boredom','anger']:
        textcat.add_label(i)


    # load the dataset, drop the columns that are not used and discard
    # the rows labelled 'empty'
    df = pd.read_csv('text_emotion.csv')
    df.drop(['tweet_id', 'author'], axis=1, inplace=True)
    df = df[df['sentiment'] != 'empty']
    # NOTE(review): unicode(df['content']) turns the whole column into ONE
    # string, so zip() walks it character by character and each "text" in
    # train_data is a single character rather than a full row.
    # NOTE(review): 'cats' is set to a plain label string here; the
    # "dictionary update sequence" ValueError raised from nlp.update()
    # suggests a dict (presumably label -> score) is expected -- confirm
    # against the spaCy documentation.
    train_data = list(zip(unicode(df['content']),
                          [{u'cats': unicode(cats)} for cats in df['sentiment']]))




    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t'.format('LOSS'))
        for i in range(n_iter):
            # losses is reset for every pass and filled in by nlp.update
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                # split the (text, annotation) pairs into two parallel tuples
                texts, annotations = zip(*batch)
                # print('texts: '+str(texts))
                # print('annotations: '+str(annotations))
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,losses=losses)
            # with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()

            print('{0:.3f}'  # print the textcat loss of this pass
                  .format(losses['textcat']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)


# Run the training with its default arguments when executed as a script.
if __name__ == '__main__':
    main()

我收到以下错误:

Traceback (most recent call last):
batch: [(u'1', {u'cats': u'sadness'}), (u' ', {u'cats': u'sadness'}), (u' ', {u'cats': u'enthusiasm'}), (u' ', {u'cats': u'neutral'})]
  File "/Users/loginofdeath/Documents/24Feb/emo_cat.py", line 91, in <module>
    main()
  File "/Users/loginofdeath/Document/24Feb/emo_cat.py", line 63, in main
    nlp.update(texts, annotations, sgd=optimizer, drop=0.2,losses=losses)
  File "/usr/local/lib/python2.7/site-packages/spacy/language.py", line 399, in update
    gold = GoldParse(doc, **gold)
  File "gold.pyx", line 430, in spacy.gold.GoldParse.__init__
ValueError: dictionary update sequence element #0 has length 1; 2 is required

我正在使用: Python版本:2.7.14

平台:Darwin-16.4.0-x86_64-i386-64bit

spaCy 版本:2.0.9

模型:zh

有人可以帮助我吗?我在 spaCy 中进行多类分类的方法是否正确?提前致谢。

【问题讨论】:

    标签: python nlp spacy multiclass-classification


    【解决方案1】:

    这个答案的全部功劳归于Vikas Singh。 下面是代码:

    from __future__ import unicode_literals, print_function
    from __future__ import unicode_literals
    
    from pathlib import Path
    
    import pandas as pd
    import spacy
    import copy
    from spacy.util import minibatch, compounding
    import re
    
    def clean_string(mystring):
        """Return *mystring* with everything except ASCII letters, digits
        and spaces removed.

        The pattern is a raw string so that no backslash escape is left to
        the string-literal parser (the previous ``'\\ '`` was an invalid
        string escape that only worked by accident and warns on newer
        Python versions).
        """
        return re.sub(r'[^A-Za-z0-9 ]+', '', mystring)
    
    
    
    def main(model=None, output_dir=None, n_iter=2, max_examples=5000):
        """Train a multi-class spaCy text classifier on 'text_emotion.csv'.

        Every training example is a ``(text, {"cats": {...}})`` pair whose
        ``cats`` dict holds one entry per sentiment found in the data: 1 for
        the sentiment of that row and 0 for all the others.

        model: name/path handed to ``spacy.load``; when None a blank 'en'
            pipeline is created instead.
        output_dir: if not None, the trained pipeline is written there with
            ``nlp.to_disk`` and then loaded back as a sanity check.
        n_iter: number of passes made over the training data.
        max_examples: only the first ``max_examples`` rows (after the
            'empty' rows are dropped) are used for training; None uses
            every row.
        """
        # ``unicode`` only exists on Python 2; fall back to ``str`` so the
        # same code also runs on Python 3.
        try:
            text_type = unicode  # noqa: F821
        except NameError:
            text_type = str

        if model is not None:
            nlp = spacy.load(model)  # load existing spaCy model
            print("Loaded model '%s'" % model)
        else:
            nlp = spacy.blank('en')  # create blank Language class
            print("Created blank 'en' model")

        # add the text classifier to the pipeline if it doesn't exist
        # nlp.create_pipe works for built-ins that are registered with spaCy
        if 'textcat' not in nlp.pipe_names:
            textcat = nlp.create_pipe('textcat')
            nlp.add_pipe(textcat, last=True)
        # otherwise, get it, so we can add labels to it
        else:
            textcat = nlp.get_pipe('textcat')

        # register every emotion label with the text classifier
        for label in ['neutral', 'worry', 'happiness', 'sadness', 'love',
                      'surprise', 'fun', 'relief', 'hate', 'enthusiasm',
                      'boredom', 'anger']:
            textcat.add_label(label)

        df = pd.read_csv('text_emotion.csv')
        df.drop(['tweet_id', 'author'], axis=1, inplace=True)
        df = df[df['sentiment'] != 'empty']

        # template with every sentiment present in the data switched off;
        # built from the full frame so each example carries all labels
        sentiment_values = df['sentiment'].unique()
        labels_default = dict.fromkeys(sentiment_values, 0)

        # truncate BEFORE building the examples so that rows which would be
        # thrown away anyway are never cleaned or converted
        subset = df if max_examples is None else df.head(max_examples)

        train_data = []
        for content, sentiment in zip(subset['content'], subset['sentiment']):
            # the template is a flat dict of ints, a shallow copy is enough
            label_values = dict(labels_default)
            label_values[sentiment] = 1
            train_data.append(
                (text_type(clean_string(content)), {"cats": label_values}))

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
        with nlp.disable_pipes(*other_pipes):  # only train textcat
            optimizer = nlp.begin_training()
            print("Training the model...")
            print('{:^5}\t'.format('LOSS'))
            for _ in range(n_iter):
                # losses is reset for every pass and filled in by nlp.update
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(train_data,
                                    size=compounding(4., 32., 1.001))
                for batch in batches:
                    # split the (text, annotation) pairs into parallel tuples
                    texts, annotations = zip(*batch)
                    nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                               losses=losses)

                # print the textcat loss of this pass
                print('{0:.3f}'.format(losses['textcat']))

        # test the trained model, most likely category first
        test_text = "This movie sucked"
        doc = nlp(test_text)
        print(test_text, sorted(doc.cats.items(), key=lambda val: val[1], reverse=True))

        if output_dir is not None:
            output_dir = Path(output_dir)
            if not output_dir.exists():
                output_dir.mkdir()
            nlp.to_disk(output_dir)
            print("Saved model to", output_dir)

            # test the saved model
            print("Loading from", output_dir)
            nlp2 = spacy.load(output_dir)
            doc2 = nlp2(test_text)
            print(test_text, doc2.cats)
    
    
    
    # Run the training with its default arguments when executed as a script.
    if __name__ == '__main__':
        main()
    

    请注意,上面的代码只使用了训练数据中的前 5000 条样本进行训练。希望这能解答大多数关于在 spaCy 中进行多类分类的疑问。

    【讨论】:

      猜你喜欢
      • 2018-08-05
      • 1970-01-01
      • 2017-04-09
      • 1970-01-01
      • 2021-05-30
      • 1970-01-01
      • 2020-10-02
      • 1970-01-01
      • 2022-01-15
      相关资源
      最近更新 更多