y 应该是一维数组，得到一个 shape () 数组答案

【问题标题】：y should be a 1d array, got an array of shape () instead（y 应该是一维数组，却得到了 shape 为 () 的数组）
【发布时间】：2021-09-14 09:51:22
【问题描述】：

我有一个经过训练并保存的模型。我正在尝试根据新数据进一步训练模型，但它给出了错误。相关部分代码：

# Setup excerpt: builds the Keras tokenizer and the bag-of-words vectorizer,
# then loads the previously trained model.
# NOTE(review): master_df, stop_words, X and the pickle import are not shown in
# this excerpt; they are presumably defined earlier in the original script.
from tensorflow.keras.preprocessing.text import Tokenizer
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
# Raw string: the filter set contains a literal backslash before ']'. In a
# normal string literal '\]' is an invalid escape sequence (it warns on modern
# Python); the raw form yields exactly the same characters without the warning.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(master_df['Observation'].values)
word_index = tokenizer.word_index

from sklearn.feature_extraction.text import CountVectorizer
# Unigram-to-trigram counts, capped at the 10,000 most frequent features.
cv=CountVectorizer(max_df=1.0,min_df=1, stop_words=stop_words, max_features=10000, ngram_range=(1,3))
X=cv.fit_transform(X)

# SECURITY: pickle.load executes arbitrary code from the file; only load
# model files that come from a trusted source.
with open("../sgd.pickle", 'rb') as f:
    sgd = pickle.load(f)

def output_sample(sentence):
    """Predict the label for a single observation.

    The sentence is passed through ``preprocess_text``, lower-cased, vectorized
    with the module-level ``cv`` and classified with the module-level ``sgd``.

    Args:
        sentence: Raw observation text.

    Returns:
        The first (and only) element of ``sgd.predict`` for this sentence.
    """
    cleaned = preprocess_text(sentence).lower()
    batch = [cleaned]
    # NOTE(review): the tokenizer is updated here but its output is not used
    # for the prediction below; fit_on_sequences is also being handed text.
    # Confirm whether this call is needed.
    tokenizer.fit_on_sequences(batch)
    vocabulary = tokenizer.word_index
    features = cv.transform(batch)
    predictions = sgd.predict(features)
    return predictions[0]

def retrain(X,y):
    """Refit the vectorizer and the model on one new example and save the model.

    Args:
        X: Raw observation text.
        y: Label passed straight through to ``sgd.fit``.

    NOTE(review): this is the version that raises the errors quoted in the
    question. It refits ``cv`` and ``sgd`` on a single sentence, so the model
    sees only one sample and one class, and the callers pass ``y`` as a 0-d
    array.
    """
    cleaned = preprocess_text(X).lower()
    docs = [cleaned]
    tokenizer.fit_on_texts(docs)
    vocabulary = tokenizer.word_index
    features = cv.fit_transform(docs)
    sgd.fit(features, y)
    # Persist the refitted model in the current working directory.
    with open('sgd.pickle', 'wb') as model_file:
        pickle.dump(sgd, model_file)
    print("Model trained on new data")

# Ask for an observation and show the model's prediction for it.
sentence = input("\n\nEnter your observation:\n\n")
output = output_sample(sentence)
print("\n\nThe risk prediction is", preprocess_text(output), "\n\n")

# Let the user confirm or correct the prediction, then retrain on that feedback.
print("Is the above prediction correct?\n")
corr = input("Press 'y' for yes or 'n' for no.\n")

if corr not in ('y', 'n'):
    print("Incorrect input. Please restart the application")
elif corr == 'y':
    # Prediction confirmed: reuse it as the training label.
    newy = np.array(output)
    retrain(sentence, newy)
else:
    print("What is the correct risk?\n1. Low\n2. Medium\n")
    r = input("Enter the appropriate number: ")
    if r == '1' or r == '2':
        newy = np.array('Low' if r == '1' else 'Medium')
        retrain(sentence, newy)
    else:
        print("Incorrect input. Please restart the application.")

当程序运行时，错误发生在sgd.fit(X,y)。错误是

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_11300/3528077041.py in <module>
      5     newy=[output]
      6     print(newy)
----> 7     retrain(sentence,newy)
      8 
      9 elif corr=='n':

~\AppData\Local\Temp/ipykernel_11300/2433836763.py in retrain(X, y)
      7     X=cv.fit_transform(X)
      8     #y = y.reshape((-1, 1))
----> 9     sgd.fit(X,y)
     10     with open('sgd.pickle', 'wb') as f:
     11         pickle.dump(sgd, f)

~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    344             if self._final_estimator != 'passthrough':
    345                 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 346                 self._final_estimator.fit(Xt, y, **fit_params_last_step)
    347 
    348         return self

~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py in fit(self, X, y, coef_init, intercept_init, sample_weight)
    727             Returns an instance of self.
    728         """
--> 729         return self._fit(X, y, alpha=self.alpha, C=1.0,
    730                          loss=self.loss, learning_rate=self.learning_rate,
    731                          coef_init=coef_init, intercept_init=intercept_init,

~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
    567         self.t_ = 1.0
    568 
--> 569         self._partial_fit(X, y, alpha, C, loss, learning_rate, self.max_iter,
    570                           classes, sample_weight, coef_init, intercept_init)
    571 

~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py in _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, classes, sample_weight, coef_init, intercept_init)
    529                              max_iter=max_iter)
    530         else:
--> 531             raise ValueError(
    532                 "The number of classes has to be greater than one;"
    533                 " got %d class" % n_classes)

ValueError: The number of classes has to be greater than one; got 1 class

数据样本如下：

Observation                                             Risk
0   A separate road for light vehicle should be ma...   Low
2   All benches were not having sufficient berm.        Low
3   As light arrangement is not adequate.               Low
4   As light arrangement is not adequate.               Low
5   As contractor's equipment record is not availa...   Low
77  First aid Room is not established.                  Medium
98  Heavy dust on haul road is found with in suffi...   Medium
79  First aid station is maintained in the Rest sh...   Medium
171 Presently explosive van is not available with ...   Medium
79  First aid station is maintained in the Rest sh...   Medium

理想情况下它应该接受输入，但我不知道为什么它会给出这个错误。

【问题讨论】：

在调用fit方法前添加y = y.reshape((-1, 1))。
@meti 我试过这个，现在它给出了这个错误The number of classes has to be greater than one; got 1 class。
错误信息表明y向量只包含一种标签类型！如果可能，请添加您的数据；）@Chinmay Datar
@meti 在问题中添加了数据样本

标签： python machine-learning nlp

【解决方案1】：

我清理了代码并对retrain 函数进行了一些更改，现在该函数将向训练集添加一个新的字符串和标签，并再次拟合分类器。代码的其他部分在逻辑上保持不变！

实用功能：

def output_sample(sentence):
    """Return the classifier's predicted label for one observation string.

    Uses the module-level ``cv`` vectorizer and ``sgd`` model; the text is
    run through ``preprocess_text`` and lower-cased first.
    """
    normalized = preprocess_text(sentence)
    normalized = normalized.lower()
    samples = [normalized]
    # NOTE(review): the tokenizer update below does not feed into the
    # prediction; verify it is still required.
    tokenizer.fit_on_sequences(samples)
    seen_words = tokenizer.word_index
    return sgd.predict(cv.transform(samples))[0]

def preprocess_text(string):
    """Placeholder cleaning hook: currently returns ``string`` unchanged.

    Put any real preprocessing here, but keep returning a string because
    callers call ``.lower()`` on the result.
    """
    cleaned = string
    return cleaned

def retrain(X,y):
    """Append one labelled example to the training data, refit, and save the model.

    The vectorizer ``cv`` and the classifier ``sgd`` are both refitted on the
    full ``master_df`` corpus plus the new example, so the classifier always
    sees every class present in ``master_df``.

    Args:
        X: Raw observation text for the new example.
        y: Label for ``X``. May be a plain string, a 0-d NumPy array such as
            ``np.array('Low')``, or a one-element sequence/array.
    """
    new_text = preprocess_text(X).lower()

    # Normalize the label to a list of plain Python values. ``tolist()`` turns
    # a 0-d array or NumPy scalar into a scalar and a 1-d array into a list.
    if hasattr(y, 'tolist'):
        y = y.tolist()
    new_labels = [y] if isinstance(y, str) else list(y)

    # Build plain lists before concatenating. ``Series + list`` (or
    # ``Series + str``) is element-wise arithmetic in pandas, which would glue
    # the new text/label onto every existing row instead of adding one row.
    texts = master_df['Observation'].to_list() + [new_text]
    labels = master_df['Risk'].to_list() + new_labels

    features = cv.fit_transform(texts)
    sgd.fit(features, labels)
    with open('sgd.pickle', 'wb') as f:
        pickle.dump(sgd, f)
    print("Model trained on new data")

实际流程：

import pickle

import nltk
import numpy as np
import pandas as pd  # pd.read_csv is used below
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

# English stop-word list from NLTK, passed to the vectorizer.
stopwords = nltk.corpus.stopwords.words('english')
cv=CountVectorizer(max_df=1.0,min_df=1, stop_words=stopwords, max_features=10000, ngram_range=(1,3))
# NOTE(review): the file has a .tsv extension but is read with the default
# separator; confirm whether sep='\t' is needed for the real data.
master_df = pd.read_csv('classification.tsv')
X=cv.fit_transform(master_df['Observation'])

# Reuse a previously saved model when one can be loaded; otherwise start from
# a fresh classifier. The file is closed by the context manager either way.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# model files that come from a trusted source.
try:
    with open("./sgd.pickle", 'rb') as f:
        sgd = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError,
        AttributeError, ImportError, IndexError):
    # Missing/unreadable file or an incompatible/corrupt pickle.
    sgd = SGDClassifier()

sgd.fit(X, master_df['Risk'].to_list())


# Predict the risk for a user-supplied observation.
sentence = input("\n\nEnter your observation:\n\n")
output = output_sample(sentence)
print("\n\nThe risk prediction is", preprocess_text(output), "\n\n")

# Collect feedback on the prediction and retrain with the confirmed label.
print("Is the above prediction correct?\n")
corr = input("Press 'y' for yes or 'n' for no.\n")

if corr == 'n':
    # Prediction rejected: ask for the correct label.
    print("What is the correct risk?\n1. Low\n2. Medium\n")
    r = input("Enter the appropriate number: ")

    if r not in ('1', '2'):
        print("Incorrect input. Please restart the application.")
    else:
        if r == '1':
            newy = np.array('Low')
        else:
            newy = np.array('Medium')
        retrain(sentence, newy)

elif corr == 'y':
    # Prediction accepted: train on the predicted label.
    newy = np.array(output)
    retrain(sentence, newy)

else:
    print("Incorrect input. Please restart the application")

【讨论】：

非常感谢你，伙计。有用。只是好奇当我们调用.fit 时，它是从头开始训练模型还是只是根据新输入进一步训练模型？