【发布时间】:2018-08-23 11:58:57
【问题描述】:
所以,我正在构建一个基本的 Keras 文本分类器,但无论我做什么,我都无法使验证准确度高于 49-50%(或更低)。我的训练准确率非常正常地攀升,从 50% 左右开始,在 4-5 个 epoch 后攀升至 80% 左右。
这是一个输出示例:
- 54s - loss: 0.6982 - acc: 0.5064 - val_loss: 0.6932 - val_acc: 0.4950
Epoch 2/3
- 57s - loss: 0.6560 - acc: 0.6580 - val_loss: 0.7324 - val_acc: 0.4950
Epoch 3/3
- 60s - loss: 0.5359 - acc: 0.7047 - val_loss: 0.7339 - val_acc: 0.4955
这是我的代码:
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)
import numpy as np
import os
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.preprocessing import sequence
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras import optimizers
np.random.seed(7)
class TextClassifier:
def __init__(self):
self.tokenizer = Tokenizer(num_words = 5000)
self.top_words = 5000
self.max_words = 500
self.model = model = Sequential()
model.add(Embedding(self.top_words,64,input_length = self.max_words))
model.add(Conv1D(filters = 64,kernel_size=3,padding='same',activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(250,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
def train(self, X_train, y_train, X_test, y_test):
self.model.fit(X_train,y_train, shuffle = True,
validation_data = (X_test,y_test),epochs = 3,batch_size=512,verbose = 2)
def predict(self,X):
return self.model.predict(X)
def init_tokenizer(self,X):
self.tokenizer.fit_on_texts(X)
def eval(self,X,y):
return self.model.evaluate(X, y, verbose=0)
def proccess_text(self,X):
vocab_text = self.tokenizer.texts_to_sequences(X)
vocab_text = sequence.pad_sequences(vocab_text,maxlen=500)
return vocab_text
def organize_text(self,pos_path,neg_path):
data = {'label':[],'text':[]}
pos_texts = self.text_to_array(pos_path)
neg_texts = self.text_to_array(neg_path)
for i in pos_texts:
data['label'].append(0)
data['text'].append(i)
for i in neg_texts:
data['label'].append(1)
data['text'].append(i)
return data
def text_to_array(self,path):
''' Takes a path argument and retrieves all the text lines from
within a folder'''
name_list = []
texts = []
for file_ in os.listdir(path):
name_list.append(file_)
for i in name_list:
file_ = open(path+i)
texts.append(file_.read().splitlines())
return texts
from TextClassifier import *
path = './data/train/'
test_path = './data/test/'
model = TextClassifier()
data = model.organize_text(path+'pos/',path+'neg/')
tests = model.organize_text(test_path+'pos/',test_path+'neg/')
model.init_tokenizer(data['text'])
model.init_tokenizer(tests['text'])
X_train = np.array(model.proccess_text(data['text']))
X_test = np.array(model.proccess_text(tests['text']))
y_train = data['label']
y_test = tests['label']
model.train(X_train,y_train,X_test,y_test)
scores = model.eval(X_test,y_test)
input_ = model.proccess_text(['It was very good! Awesome! Enjoyable!'])
print("Predict: ")
predict = model.predict(input_)
print(predict)
print("Accuracy: %.2f%%" % (scores[1]*100))
我的语料库来自这里:http://ai.stanford.edu/~amaas/data/sentiment/,我只是将每个类的前 1000 个作为验证数据。 (他们是无序的 afaik)
【问题讨论】:
标签: python tensorflow keras