【发布时间】:2020-11-27 07:47:32
【问题描述】:
我一直在研究一个具有一个隐藏层的神经网络,三层中的每一层都有灵活数量的节点。代码如下:
import time
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
# Display names for the ten digit classes, indexed by integer label.
class_names = [str(digit) for digit in range(10)]
class NeuralNetwork():
    """Sigmoid network with one hidden layer, trained by batch gradient descent.

    Gradients are accumulated sample by sample in the ``*Summations`` arrays
    during ``train`` and applied once per call in ``change``, together with a
    ``Lambda * weights`` penalty on the weights (biases are not penalised).

    Class attributes (hyper-parameters and bookkeeping):
        correct: number of correctly classified samples in the current epoch.
        num_predictions: number of test images the driver script displays.
        epochs: number of training epochs the driver script runs.
        sizeOfEpoch: number of leading training samples used per epoch.
        Lambda: coefficient of the weight penalty applied in ``change``.
        learningRate: step size applied in ``change``.
    """

    correct = 0
    num_predictions = 10
    epochs = 100
    sizeOfEpoch = 5000
    Lambda = 10
    learningRate = 0.00001

    def __init__(self, sizes):
        """Allocate parameters and accumulators.

        Args:
            sizes: sequence ``[n_inputs, n_hidden, n_outputs]``.
        """
        self.dimensions = sizes
        # x / y hold the per-epoch error curve plotted by the driver script.
        self.x = np.arange(1, self.epochs + 1)
        self.y = np.empty(self.epochs)
        self.secondLayerNeurons = np.empty(sizes[1])
        self.outputNeurons = np.empty(sizes[2])
        # NOTE(review): np.random.rand draws from [0, 1); with many inputs the
        # all-positive weights may saturate the sigmoid -- consider a
        # zero-centred, scaled initialisation if training stalls.
        self.firstLayerWeights = np.random.rand(sizes[1], sizes[0])
        self.secondLayerWeights = np.random.rand(sizes[2], sizes[1])
        self.firstLayerBiases = np.random.rand(sizes[1])
        self.secondLayerBiases = np.random.rand(sizes[2])
        self.firstLayerWeightsSummations = np.zeros([sizes[1], sizes[0]])
        self.secondLayerWeightsSummations = np.zeros([sizes[2], sizes[1]])
        self.firstLayerBiasesSummations = np.zeros([sizes[1]])
        self.secondLayerBiasesSummations = np.zeros([sizes[2]])
        self.hiddenLayerErrors = np.empty(sizes[1])
        self.outputLayerErrors = np.empty(sizes[2])

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``."""
        return 1/(1+np.exp(-x))

    def sigmoidDerivative(self, x):
        """Return ``x * (1 - x)``.

        ``x`` is expected to already be a sigmoid *output* (as passed by
        ``backProp``), not a pre-activation value.
        """
        return np.multiply(x, (1-x))

    def forwardProp(self, inputs):
        """Compute hidden and output activations for one input vector.

        Results are stored in ``secondLayerNeurons`` and ``outputNeurons``.
        """
        self.secondLayerNeurons = self.sigmoid(
            np.dot(self.firstLayerWeights, inputs) + self.firstLayerBiases)
        self.outputNeurons = self.sigmoid(
            np.dot(self.secondLayerWeights, self.secondLayerNeurons)
            + self.secondLayerBiases)

    def backProp(self, inputs, correct_output):
        """Accumulate the gradients for one sample into the summation arrays.

        Must be called after ``forwardProp`` for the same ``inputs``.

        Args:
            inputs: the input vector that was fed to ``forwardProp``.
            correct_output: target vector for the output layer.
        """
        # Output delta is simply (activation - target).
        self.outputLayerErrors = np.subtract(self.outputNeurons, correct_output)
        self.hiddenLayerErrors = np.multiply(
            np.dot(self.secondLayerWeights.T, self.outputLayerErrors),
            self.sigmoidDerivative(self.secondLayerNeurons))
        # Outer products give delta[i] * activation[j] for every weight.
        self.secondLayerBiasesSummations += self.outputLayerErrors
        self.secondLayerWeightsSummations += np.outer(
            self.outputLayerErrors, self.secondLayerNeurons)
        self.firstLayerBiasesSummations += self.hiddenLayerErrors
        self.firstLayerWeightsSummations += np.outer(
            self.hiddenLayerErrors, inputs)

    def train(self, trainImages, trainLabels):
        """Run one epoch over the first ``sizeOfEpoch`` samples and update once.

        Args:
            trainImages: indexable collection of image arrays.
            trainLabels: indexable collection of integer class labels.

        Returns:
            The largest absolute output delta observed at the sampled steps
            (every 200th sample) of this epoch.
        """
        size = str(self.sizeOfEpoch)
        greatestError = 0.0
        # Pre-set so the summary line below works even for an empty epoch.
        accuracy = '0%'
        start_time2 = time.time()
        for m in range(self.sizeOfEpoch):
            label = int(trainLabels[m])
            # One-hot target vector for this sample.
            correct_output = np.zeros([self.dimensions[2]])
            correct_output[label] = 1.0
            inputs = trainImages[m].flatten()
            self.forwardProp(inputs)
            self.backProp(inputs, correct_output)
            if np.argmax(self.outputNeurons) == label:
                self.correct += 1
            if m % 200 == 0:
                error = np.amax(np.absolute(self.outputLayerErrors))
                if error > greatestError:
                    greatestError = error
                accuracy = str(int((self.correct/(m+1))*100)) + '%'
                percent = str(int((m/self.sizeOfEpoch)*100)) + '%'
                print ("Progress: " + percent + " -- Accuracy: " + accuracy + " -- Error: " + str(greatestError), end="\r")
        self.change()
        time2 = str(round((time.time() - start_time2), 2))
        print (size + '/' + size + " -- " + time2 + "s" + " -- Accuracy: " + accuracy + " -- Error: " + str(greatestError), end="\r")
        return greatestError

    def change(self):
        """Apply the accumulated gradients, then reset all accumulators."""
        rate = self.learningRate
        self.secondLayerBiases -= rate*self.secondLayerBiasesSummations
        self.secondLayerWeights -= rate*(
            self.secondLayerWeightsSummations
            + self.Lambda*self.secondLayerWeights)
        self.firstLayerBiases -= rate*self.firstLayerBiasesSummations
        self.firstLayerWeights -= rate*(
            self.firstLayerWeightsSummations
            + self.Lambda*self.firstLayerWeights)
        # Reset the *weight* accumulators too; leaving them untouched makes
        # every epoch re-apply the gradients of all previous epochs.
        self.firstLayerWeightsSummations = np.zeros(
            [self.dimensions[1], self.dimensions[0]])
        self.secondLayerWeightsSummations = np.zeros(
            [self.dimensions[2], self.dimensions[1]])
        self.firstLayerBiasesSummations = np.zeros(self.dimensions[1])
        self.secondLayerBiasesSummations = np.zeros(self.dimensions[2])
        self.correct = 0

    def predict(self, testImage):
        """Return the index of the strongest output for ``testImage``.

        Unlike ``forwardProp`` this does not modify any stored activations.
        """
        secondLayerAnsNodes = self.sigmoid(
            np.dot(self.firstLayerWeights, testImage) + self.firstLayerBiases)
        outputAns = self.sigmoid(
            np.dot(self.secondLayerWeights, secondLayerAnsNodes)
            + self.secondLayerBiases)
        return np.argmax(outputAns)
if __name__ == "__main__":
    # Train on MNIST, plot the per-epoch error, then show a few predictions.
    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()
    # Scale pixel values by 1/255 (assumes 8-bit images -- TODO confirm).
    train_images = train_images/255.0
    test_images = test_images/255.0
    neural_network = NeuralNetwork([784, 16, 10])
    start_time = time.time()
    for i in range(neural_network.epochs):
        print("\nEpoch", str(i+1) + "/" + str(neural_network.epochs))
        neural_network.y[i] = neural_network.train(train_images, train_labels)
    # Use a separate name: rebinding `time` would shadow the time module and
    # break every later time.time() call (including those inside train()).
    elapsed = time.time() - start_time
    plt.plot(neural_network.x, neural_network.y, 'b')
    # The curve is the error returned by train() for each epoch.
    plt.ylabel('Error')
    plt.xlabel('Epochs')
    plt.show()
    print("\n\n\nTotal Time Used")
    # Report minutes once the run exceeds a minute, otherwise seconds.
    if elapsed >= 60:
        print("Minutes: %s" % round((elapsed/60), 2))
    else:
        print("Seconds: %s" % round(elapsed, 2))
    for i in range(neural_network.num_predictions):
        prediction = neural_network.predict(test_images[i].flatten())
        plt.grid(False)
        plt.imshow(test_images[i], cmap=plt.cm.binary)
        plt.title("Prediction: " + str(prediction) + " -- Actual: " + class_names[test_labels[i]] + "\n" + str(i+1) + "/" + str(neural_network.num_predictions))
        plt.show()
由于某种原因,此代码不适用于更复杂的问题。误差没有得到最小化,准确性保持不变。这个确切的代码适用于 xor 问题和另一个类似的问题。当我尝试给它 MNIST 数字数据集时,它不起作用。唯一不同的是,每一层的节点较多,算法是一样的。
这可能是什么问题?
这是运行 20 个 epoch 后的图表,学习率为 0.000001,lambda 为 10。它显示了每个 epoch 的误差。y 轴标签应该是“Error”(误差),而不是“Error Change”(误差变化)。 https://i.stack.imgur.com/fLXzz.png
【问题讨论】:
-
我对你的损失函数感到困惑。你有
self.outputLayerErrors = np.subtract(self.outputNeurons, correct_output)。这不是损失函数;它可以是任意负数,因此似乎将其最小化会任意鼓励输出神经元始终收敛于零(因为 0 - correctOutput 是使用此损失函数可以获得的“最小”损失)。对于分类程序,您可以检查交叉熵损失。基本上,您将输出值解释为类概率,并最大化正确类的对数似然性,同时最小化其他类 -
@Nerdizzle 这不是我的损失函数,这是我的损失函数导数的一部分。该导数基于 Andrew Ng 的视频,链接为youtube.com/…,时间为 7:52
-
我明白了,所以
outputLayerErrors 并不是真正的输出层误差;它们实际上是交叉熵损失相对于输出层 logits 的梯度。有了这种理解,我检查了你的代码,看起来还不错。我建议调整学习率(对给定任务而言,学习率太高总是会使 MLP 发散)。尝试不同的数量级(例如 0.0001、0.00001 和 0.000001)。我也会尝试增加隐藏节点的数量。 10似乎很少。我还将尝试调整超参数,看看我能做什么。 -
@Nerdizzle 非常感谢您实际查看我的代码,我将尝试调整一些常量,看看会发生什么!我添加了一些代码来绘制输出错误,一旦完成训练,我将发布它
-
我已经完成了我的实验。我整天在后台运行它们。请参阅下面我发布的答案。
标签: python machine-learning deep-learning neural-network regression