人工神经网络中反向传播的全矩阵方法答案

【问题标题】：Full-matrix approach to backpropagation in Artificial Neural Network人工神经网络中反向传播的全矩阵方法
【发布时间】：2015-10-14 15:13:48
【问题描述】：

我最近正在学习人工神经网络 (ANN)，并且基于 mini-batch 训练，我得到了一个在 Python 中工作和运行的代码。我遵循Michael Nilson's Neural Networks and Deep Learning 的书，其中为初学者逐步解释了每个算法。还有一个完整的手写数字识别代码，对我来说也很好。

但是，我试图通过将整个小批量一起传递以通过矩阵形式的反向传播进行训练来稍微调整代码。我还为此开发了一个工作代码，但是代码在运行时执行得非常慢。有什么方法可以实现基于全矩阵的方法来基于反向传播算法对网络进行小批量学习？

import numpy as np
import pandas as pd

class Network:

    def __init__(self, sizes):
        self.layers = len(sizes)
        self.sizes = sizes

        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for y, x in zip(sizes[1:], sizes[:-1])]

    def feed_forward(self, a):
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w,a) + b)
        return a

    # Calculate the cost derivative (Gradient of C w.r.t. 'a' - Nabla C(a))
    def cost_derivative(self, output_activation, y):
        return (output_activation - y)


    def update_mini_batch(self, mini_batch, eta):

        from scipy.linalg import block_diag 

        n = len(mini_batch)

        xs = [x for x, y in mini_batch]
        features = block_diag(*xs)

        ys = [y for x, y in mini_batch]
        responses = block_diag(*ys)

        ws = [a for a in self.weights for i in xrange(n)]

        new_list = []
        k = 0
        while (k < len(ws)):
            new_list.append(ws[k: k + n])
            k += n

        weights = [block_diag(*elems) for elems in new_list]

        bs = [b for b in self.biases for i in xrange(n)]

        new_list2 = []
        j = 0
        while (j < len(bs)):
            new_list2.append(bs[j : j + n])
            j += n

        biases = [block_diag(*elems) for elems in new_list2]

        baises_dim_1 = [np.dot(np.ones((n*b.shape[0], b.shape[0])), b) for b in self.biases]
        biases_dim_2 = [np.dot(b, np.ones((b.shape[1], n*b.shape[1]))) for b in baises_dim_1]
        weights_dim_1 = [np.dot(np.ones((n*w.shape[0], w.shape[0])), w) for w in self.weights]
        weights_dim_2 = [np.dot(w, np.ones((w.shape[1], n*w.shape[1]))) for w in weights_dim_1]

        nabla_b =  [np.zeros(b.shape) for b in biases_dim_2]
        nabla_w = [np.zeros(w.shape) for w in weights_dim_2]

        delta_b = [np.zeros(b.shape) for b in self.biases]
        delta_w = [np.zeros(w.shape) for w in self.weights]

        zs = []
        activation = features
        activations = [features]

        for w, b in zip(weights, biases):

            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        delta = self.cost_derivative(activations[-1], responses) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        for l in xrange(2, self.layers):
            z = zs[-l]                                                                      # the weighted input for that layer
            activation_prime = sigmoid_prime(z)                                             # the derivative of activation for the layer
            delta = np.dot(weights[-l + 1].transpose(), delta) * activation_prime           # calculate the adjustment term (delta) for that layer
            nabla_b[-l] = delta                                                             # calculate the bias adjustments - by means of using eq-BP3.
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())                  # calculate the weight adjustments - by means of using eq-BP4.

        delta_b = [self.split_cases(b, n) for b in nabla_b]
        delta_w = [self.split_cases(w, n) for w in nabla_w]

        self.weights = [w - (eta/n) * nw for w, nw in zip(self.weights, delta_w)]
        self.biases = [b - (eta/ n) * nb for b, nb in zip(self.biases, delta_b)]



    def split_cases(self, mat, mini_batch_size):
        i = 0
        j = 0
        dim1 = mat.shape[0]/mini_batch_size
        dim2 = mat.shape[1]/mini_batch_size
        sum_samples = np.zeros((dim1, dim2))
        while i < len(mat):

            sum_samples = sum_samples + mat[i: i + dim1, j : j + dim2]
            i += dim1
            j += dim2

        return sum_samples

    """Stochastic Gradient Descent for training in epochs"""
    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data = None):

        n = len(training_data)

        if test_data:
            n_test = len(test_data)

        for j in xrange(epochs):
            np.random.shuffle(training_data)                                                                    # for each epochs the mini-batches are selected randomly
            mini_batches = [training_data[k: k+mini_batch_size] for k in xrange(0, n, mini_batch_size)]     # select equal sizes of mini-batches for the epochs (last mini_batch size might differ however)

            c = 1

            for mini_batch in mini_batches:
                print "Updating mini-batch {0}".format(c)
                self.update_mini_batch(mini_batch, eta)
                c += 1
            if test_data:
                print "Epoch {0}: {1}/{2}".format(j, self.evaluate(test_data), n_test)

            else:
                print "Epoch {0} completed.".format(j)

    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data]
        return (sum(int(x == y) for x, y in test_results))

    def export_results(self, test_data):
        results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data]
        k = pd.DataFrame(results)
        k.to_csv('net_results.csv')


# Global functions

## Activation function (sigmoid)
@np.vectorize
def sigmoid(z):
    return 1.0/(1.0 + np.exp(-z))

## Activation derivative (sigmoid_prime)
@np.vectorize
def sigmoid_prime(z):
    return sigmoid(z)*(1 - sigmoid(z))

【问题讨论】：

标签： python numpy neural-network backpropagation gradient-descent

【解决方案1】：

这是我的代码。在我的机器上，迭代 30 个 epoch 所需的时间从 800+ 秒减少到 200+ 秒。

由于我是 python 新手，所以我使用现成的东西。这个 sn-p 只需要 numpy 即可运行。

试一试。

def feedforward2(self, a):
    zs = []
    activations = [a]

    activation = a
    for b, w in zip(self.biases, self.weights):
        z = np.dot(w, activation) + b
        zs.append(z)
        activation = sigmoid(z)
        activations.append(activation)

    return (zs, activations)

def update_mini_batch2(self, mini_batch, eta):
    batch_size = len(mini_batch)

    # transform to (input x batch_size) matrix
    x = np.asarray([_x.ravel() for _x, _y in mini_batch]).transpose()
    # transform to (output x batch_size) matrix
    y = np.asarray([_y.ravel() for _x, _y in mini_batch]).transpose()

    nabla_b, nabla_w = self.backprop2(x, y)
    self.weights = [w - (eta / batch_size) * nw for w, nw in zip(self.weights, nabla_w)]
    self.biases = [b - (eta / batch_size) * nb for b, nb in zip(self.biases, nabla_b)]

    return

def backprop2(self, x, y):

    nabla_b = [0 for i in self.biases]
    nabla_w = [0 for i in self.weights]

    # feedforward
    zs, activations = self.feedforward2(x)

    # backward pass
    delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
    nabla_b[-1] = delta.sum(1).reshape([len(delta), 1]) # reshape to (n x 1) matrix
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())

    for l in xrange(2, self.num_layers):
        z = zs[-l]
        sp = sigmoid_prime(z)
        delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
        nabla_b[-l] = delta.sum(1).reshape([len(delta), 1]) # reshape to (n x 1) matrix
        nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())

    return (nabla_b, nabla_w)

【讨论】：

【解决方案2】：

根据书中的原始代码，我对代码进行了细微的修改。代码如下。

import random
import numpy as np

class Network(object):

def __init__(self, sizes):
    self.num_layers = len(sizes)
    self.sizes = sizes
    self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
    self.weights = [np.random.randn(y, x)
                    for x, y in zip(sizes[:-1], sizes[1:])]

def feedforward(self, a):
    for b, w in zip(self.biases, self.weights):
        a = sigmoid(np.dot(w, a) + b)
    return a

def SGD(self, training_data, epochs, mini_batch_size, eta,
        test_data=None):
    training_data = list(training_data)
    n = len(training_data)

    if test_data:
        test_data = list(test_data)
        n_test = len(test_data)

    for j in range(epochs):
        random.shuffle(training_data)
        mini_batches = [
            training_data[k:k + mini_batch_size]
            for k in range(0, n, mini_batch_size)]
        for mini_batch in mini_batches:
            self.update_mini_batch(mini_batch, eta)
        if test_data:
            print("Epoch {} : {} / {}".format(j, self.evaluate(test_data), n_test))
        else:
            print("Epoch {} complete".format(j))

def update_mini_batch(self, mini_batch, eta):
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]

    x_matrix_0 = [x for x, y in mini_batch]
    y_matrix_0 = [y for x, y in mini_batch]
    x_matrix = np.concatenate(x_matrix_0, axis=1)
    y_matrix = np.concatenate(y_matrix_0, axis=1)

    nabla_b, nabla_w = self.backprop(x_matrix, y_matrix)

    self.weights = [w - (eta / len(mini_batch)) * nw
                    for w, nw in zip(self.weights, nabla_w)]
    self.biases = [b - (eta / len(mini_batch)) * nb
                   for b, nb in zip(self.biases, nabla_b)]

def backprop(self, x, y):
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]
    # feedforward
    activation = x
    activations = [x]  # list to store all the activations, layer by layer
    zs = []  # list to store all the z vectors, layer by layer
    for b, w in zip(self.biases, self.weights):
        z = np.dot(w, activation) + np.kron(b, np.ones([1, y.shape[1]]))
        zs.append(z)
        activation = sigmoid(z)
        activations.append(activation)
    # backward pass
    delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
    nabla_b[-1] = np.reshape([np.sum(nb) for nb in delta], [delta.shape[0], 1])
    for _d, _a in zip(delta.transpose(), activations[-2].transpose()):
        _d = np.reshape(_d, [len(_d), 1])
        _a = np.reshape(_a, [len(_a), 1])
        nabla_w[-1] += np.dot(_d, _a.transpose())

    for l in range(2, self.num_layers):
        z = zs[-l]
        sp = sigmoid_prime(z)
        delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
        nabla_b[-l] = np.reshape([np.sum(nb) for nb in delta], [delta.shape[0], 1])
        for _d, _a in zip(delta.transpose(), activations[-l-1].transpose()):
            _d = np.reshape(_d, [len(_d), 1])
            _a = np.reshape(_a, [len(_a), 1])
            nabla_w[-l] += np.dot(_d, _a.transpose())
    return nabla_b, nabla_w

def cost_derivative(self, output_activations, y):
    return (output_activations - y)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))


def sigmoid_prime(z):
    return sigmoid(z) * (1 - sigmoid(z))

【讨论】：