Ref: CS231n Winter 2016: Lecture 4: Backpropagation
Ref: How to implement a NN (Chinese translated version)
Ref: Jacobian and Hessian matrices
For this part, please study the second reference in detail, and work through the derivations by hand on paper.
Understanding the Chain Rule
Propagating gradients according to the chain rule:
Backpropagating through the lecture's example circuit f(w, x) = 1/(1 + e^{-(w0*x0 + w1*x1 + w2)}), with inputs w0 = 2, x0 = -1, w1 = -3, x1 = -2, w2 = -3 (forward values: 1.00 -> -1.00 -> 0.37 -> 1.37 -> 0.73):
- 1/x gate: d(1/x)/dx = -1/x^2; plugging in x = 1.37 --> -0.53
- +1 gate: the local derivative of x + 1 is 1, so 1 * (-0.53) --> -0.53
- exp gate: d(e^x)/dx = e^x; at x = -1, e^(-1) * (-0.53) --> -0.20
- *(-1) gate: the local derivative is -1, so (-1) * (-0.20) --> 0.20
- The add gates simply pass the gradient through unchanged (local derivative 1 on every input)
- Partials at the inputs (a mul gate hands each input the other input as its local derivative): for w0 it is x0 * 0.20 = -1 * 0.20 = -0.20; for x0 it is w0 * 0.20 = 2 * 0.20 = 0.40
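This can be checked numerically. A minimal sketch of the staged forward/backward pass, using the lecture's inputs above:

import numpy as np
# f(w, x) = 1 / (1 + exp(-(w0*x0 + w1*x1 + w2))), the CS231n Lecture 4 example
w = [2.0, -3.0, -3.0]
x = [-1.0, -2.0]
# Forward pass, one gate at a time
dot = w[0]*x[0] + w[1]*x[1] + w[2]   # =  1.00
neg = -1.0 * dot                     # = -1.00  (*-1 gate)
e   = np.exp(neg)                    # =  0.37  (exp gate)
inc = e + 1.0                        # =  1.37  (+1 gate)
f   = 1.0 / inc                      # =  0.73  (1/x gate)
# Backward pass: local derivative times upstream gradient at each gate
dinc = -1.0 / inc**2                 # --> -0.53
de   = 1.0 * dinc                    # --> -0.53  (+1 gate passes it through)
dneg = np.exp(neg) * de              # --> -0.20  (e^x is its own derivative)
ddot = -1.0 * dneg                   # -->  0.20  (*-1 gate flips the sign)
dw0, dx0 = x[0] * ddot, w[0] * ddot  # --> -0.20 and 0.40 (mul gate swaps inputs)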
sigmoid
The following is a truly great example for demonstrating the sigmoid: since σ'(z) = σ(z)(1 - σ(z)), the last four gates above (*-1, exp, +1, 1/x) collapse into a single sigmoid gate whose local gradient at the forward value 0.73 is 0.73 * (1 - 0.73) ≈ 0.20, exactly the number obtained gate by gate.
Three tricks to take away (the gradient-flow patterns from the lecture): the add gate distributes the upstream gradient unchanged to all of its inputs, the max gate routes it only to the input that achieved the max, and the mul gate scales it by the other input.
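A tiny numeric sketch of these three patterns (illustrative only, not code from the lecture):

a, b, upstream = 3.0, -4.0, 2.0
# add gate q = a + b: local derivatives are both 1 --> distributor
print(1.0 * upstream, 1.0 * upstream)                     # 2.0 2.0
# max gate q = max(a, b): gradient flows only to the winning input --> router
print(float(a > b) * upstream, float(b >= a) * upstream)  # 2.0 0.0
# mul gate q = a * b: each input receives the *other* input as local derivative
print(b * upstream, a * upstream)                         # -8.0 6.0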
Implementing the Algorithm in Code
- Part 1: Linear regression
- Intermezzo 1: Logistic classification function
- Part 2: Logistic regression (classification)
- Part 3: Hidden layer
All of the parts above ultimately come down to understanding the following code:
# Python imports
import numpy as np # Matrix and vector computation package
import matplotlib.pyplot as plt # Plotting library
from matplotlib.colors import colorConverter, ListedColormap # some plotting functions
from mpl_toolkits.mplot3d import Axes3D # 3D plots
from matplotlib import cm # Colormaps
# Allow matplotlib to plot inside this notebook
%matplotlib inline
# Set the seed of the numpy random number generator so that the tutorial is reproducible
np.random.seed(seed=1)
# Define and generate the samples
nb_of_samples_per_class = 20 # The number of samples in each class
blue_mean = [0] # The mean of the blue class
red_left_mean = [-2] # The mean of the left red cluster
red_right_mean = [2] # The mean of the right red cluster
std_dev = 0.5 # standard deviation of both classes
# Generate samples from both classes
x_blue = np.random.randn(nb_of_samples_per_class, 1) * std_dev + blue_mean
x_red_left = np.random.randn(nb_of_samples_per_class//2, 1) * std_dev + red_left_mean  # integer division so the shape is an int
x_red_right = np.random.randn(nb_of_samples_per_class//2, 1) * std_dev + red_right_mean
# Merge samples in set of input variables x, and corresponding set of
# output variables t
x = np.vstack((x_blue, x_red_left, x_red_right))
t = np.vstack((np.ones((x_blue.shape[0], 1)),
               np.zeros((x_red_left.shape[0], 1)),
               np.zeros((x_red_right.shape[0], 1))))
# The data is now ready
###############################################################################
# Plot samples from both classes as lines on a 1D space
plt.figure(figsize=(8,0.5))
plt.xlim(-3,3)
plt.ylim(-1,1)
# Plot samples
plt.plot(x_blue, np.zeros_like(x_blue), 'b|', ms = 30)
plt.plot(x_red_left, np.zeros_like(x_red_left), 'r|', ms = 30)
plt.plot(x_red_right, np.zeros_like(x_red_right), 'r|', ms = 30)
plt.gca().axes.get_yaxis().set_visible(False)
plt.title('Input samples from the blue and red class')
plt.xlabel('$x$', fontsize=15)
plt.show()
###############################################################################
# Define the rbf function
def rbf(z):
    return np.exp(-z**2)
# Plot the rbf function
z = np.linspace(-6,6,100)
plt.plot(z, rbf(z), 'b-')
plt.xlabel('$z$', fontsize=15)
plt.ylabel('$e^{-z^2}$', fontsize=15)
plt.title('RBF function')
plt.grid()
plt.show()
###############################################################################
# Define the logistic function
def logistic(z):
    return 1 / (1 + np.exp(-z))
# Function to compute the hidden activations
def hidden_activations(x, wh):
    return rbf(x * wh)
# Define output layer feedforward
def output_activations(h, wo):
    return logistic(h * wo - 1)
# Define the neural network function
def nn(x, wh, wo):
    return output_activations(hidden_activations(x, wh), wo)
# Define the neural network prediction function that only returns
# 1 or 0 depending on the predicted class
def nn_predict(x, wh, wo):
    return np.around(nn(x, wh, wo))
###############################################################################
# Define the cost function
def cost(y, t):
    return -np.sum(np.multiply(t, np.log(y)) + np.multiply((1-t), np.log(1-y)))
# Define a function to calculate the cost for a given set of parameters
def cost_for_param(x, wh, wo, t):
    return cost(nn(x, wh, wo), t)
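# Note (not part of the original tutorial): np.log(y) returns -inf when y
# hits exactly 0 or 1, which can poison the cost. Clipping the predictions
# is a common safeguard; a minimal sketch:
def cost_safe(y, t, eps=1e-12):
    y = np.clip(y, eps, 1 - eps)  # keep y strictly inside (0, 1)
    return -np.sum(t * np.log(y) + (1 - t) * np.log(1 - y))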
###############################################################################
# Plot the cost in function of the weights
# Define a vector of weights for which we want to plot the cost
nb_of_ws = 200 # compute the cost nb_of_ws times in each dimension
wsh = np.linspace(-10, 10, num=nb_of_ws) # hidden weights
wso = np.linspace(-10, 10, num=nb_of_ws) # output weights
ws_x, ws_y = np.meshgrid(wsh, wso) # generate grid
cost_ws = np.zeros((nb_of_ws, nb_of_ws)) # initialize cost matrix
# Fill the cost matrix for each combination of weights
for i in range(nb_of_ws):
    for j in range(nb_of_ws):
        cost_ws[i,j] = cost(nn(x, ws_x[i,j], ws_y[i,j]), t)  # the cost surface over the weight grid is a great way to visualize it
# Plot the cost function surface
fig = plt.figure()
ax = Axes3D(fig)
# plot the surface
surf = ax.plot_surface(ws_x, ws_y, cost_ws, linewidth=0, cmap=cm.pink)
ax.view_init(elev=60, azim=-30)
cbar = fig.colorbar(surf)
ax.set_xlabel('$w_h$', fontsize=15)
ax.set_ylabel('$w_o$', fontsize=15)
ax.set_zlabel('$\\xi$', fontsize=15)
cbar.ax.set_ylabel('$\\xi$', fontsize=15)
plt.title('Cost function surface')
plt.grid()
plt.show()
###############################################################################
# Define the gradient of the cost w.r.t. the output-layer input
# (for the cross-entropy cost with a logistic output this simplifies to y - t)
def gradient_output(y, t):
    return y - t
# Define the gradient function for the weight parameter at the output layer
def gradient_weight_out(h, grad_output):
    return h * grad_output
# Define the gradient function for the hidden layer
def gradient_hidden(wo, grad_output):
    return wo * grad_output
# Define the gradient function for the weight parameter at the hidden layer
# (rbf'(zh) = -2 * zh * rbf(zh) = -2 * zh * h, and dzh/dwh = x)
def gradient_weight_hidden(x, zh, h, grad_hidden):
    return x * -2 * zh * h * grad_hidden
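# Sanity check (not part of the original tutorial): compare the analytic
# gradient of the cost w.r.t. wh against a centered finite difference,
# using the nn/cost helpers defined above. wh=2, wo=-5 are just probe values.
def check_wh_gradient(x, t, wh=2.0, wo=-5.0, eps=1e-4):
    zh = x * wh
    h = rbf(zh)
    y = output_activations(h, wo)
    grad_hidden = gradient_hidden(wo, gradient_output(y, t))
    analytic = gradient_weight_hidden(x, zh, h, grad_hidden).sum()
    numeric = (cost_for_param(x, wh + eps, wo, t)
               - cost_for_param(x, wh - eps, wo, t)) / (2 * eps)
    print(analytic, numeric)  # the two values should agree to several decimals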
# Define the update function to update the network parameters over 1 iteration
def backprop_update(x, t, wh, wo, learning_rate):
    # Compute the output of the network.
    # This can be done with y = nn(x, wh, wo), but we need the intermediate
    # h and zh for the weight updates.
    zh = x * wh
    h = rbf(zh)  # hidden_activations(x, wh)
    y = output_activations(h, wo)
    # The three lines above are the forward pass that produces the output
    # Compute the gradient at the output
    grad_output = gradient_output(y, t)
    # Get the delta for wo
    d_wo = learning_rate * gradient_weight_out(h, grad_output)  # <-- update amount for wo
    # Compute the gradient at the hidden layer
    grad_hidden = gradient_hidden(wo, grad_output)
    # Get the delta for wh
    d_wh = learning_rate * gradient_weight_hidden(x, zh, h, grad_hidden)  # <-- update amount for wh
    # Sum the per-sample deltas and return the updated weight pair
    # (stepping against the gradient to reduce the cost)
    return (wh - d_wh.sum(), wo - d_wo.sum())
###############################################################################
# Run backpropagation
# Set the initial weight parameter
wh = 2
wo = -5
# Set the learning rate
learning_rate = 0.2
# Start the gradient descent updates and plot the iterations
nb_of_iterations = 50 # number of gradient descent updates
lr_update = learning_rate / nb_of_iterations # amount to subtract from the learning rate each iteration (linear decay)
w_cost_iter = [(wh, wo, cost_for_param(x, wh, wo, t))] # List to store the weight values over the iterations
for i in range(nb_of_iterations):
    learning_rate -= lr_update # the learning rate keeps shrinking over the iterations
    # Update the weights via backpropagation
    wh, wo = backprop_update(x, t, wh, wo, learning_rate) # takes the old weights, returns the new ones
    w_cost_iter.append((wh, wo, cost_for_param(x, wh, wo, t))) # Store the values for plotting
# Print w_cost_iter to inspect the descent trajectory ----> see [Result]
# Print the final cost
print('final cost is {:.2f} for weights wh: {:.2f} and wo: {:.2f}'.format(cost_for_param(x, wh, wo, t), wh, wo))
###############################################################################
# Plot the weight updates on the error surface
# Plot the error surface
fig = plt.figure()
ax = Axes3D(fig)
surf = ax.plot_surface(ws_x, ws_y, cost_ws, linewidth=0, cmap=cm.pink)
ax.view_init(elev=60, azim=-30)
cbar = fig.colorbar(surf)
cbar.ax.set_ylabel('$\\xi$', fontsize=15)
# Plot the updates
for i in range(1, len(w_cost_iter)):
    wh1, wo1, c1 = w_cost_iter[i-1]
    wh2, wo2, c2 = w_cost_iter[i]
    # Plot the weight-cost value and the line that represents the update
    ax.plot([wh1], [wo1], [c1], 'w+')  # Plot the weight-cost value
    ax.plot([wh1, wh2], [wo1, wo2], [c1, c2], 'w-')
# Plot the last weights
wh1, wo1, c1 = w_cost_iter[-1]
ax.plot([wh1], [wo1], [c1], 'w+')
# Show figure
ax.set_xlabel('$w_h$', fontsize=15)
ax.set_ylabel('$w_o$', fontsize=15)
ax.set_zlabel('$\\xi$', fontsize=15)
plt.title('Gradient descent updates on cost surface')
plt.grid()
plt.show()
Result: the descent trajectory differs with the learning rate
Adding one more hidden layer, as below; after working through the derivation, the recursive pattern becomes clear:
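A sketch of that recursion (the names wh1, wh2 and the two-RBF-layer structure are illustrative, reusing rbf and output_activations from the code above; this is not the tutorial's exact Part 3 network):

def backprop_two_hidden(x, t, wh1, wh2, wo):
    # Forward pass through two RBF hidden layers
    zh1 = x * wh1
    h1 = rbf(zh1)
    zh2 = h1 * wh2
    h2 = rbf(zh2)
    y = output_activations(h2, wo)
    # Backward pass: each layer's error is the next layer's error pushed
    # back through the weight, times the local derivative rbf'(z) = -2*z*rbf(z)
    grad_output = y - t                          # error at the output
    d_wo = h2 * grad_output
    grad_zh2 = wo * grad_output * -2 * zh2 * h2  # error at hidden layer 2
    d_wh2 = h1 * grad_zh2
    grad_zh1 = wh2 * grad_zh2 * -2 * zh1 * h1    # the same step, one layer down
    d_wh1 = x * grad_zh1
    return d_wh1, d_wh2, d_wo                    # sum over samples before updating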
Multi-class classification: implementing softmax in code
# Python imports
import numpy as np # Matrix and vector computation package
import matplotlib.pyplot as plt # Plotting library
from matplotlib.colors import colorConverter, ListedColormap # some plotting functions
from mpl_toolkits.mplot3d import Axes3D # 3D plots
from matplotlib import cm # Colormaps
# Allow matplotlib to plot inside this notebook
%matplotlib inline
###############################################################################
# Define the softmax function
def softmax(z):
    return np.exp(z) / np.sum(np.exp(z))
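# Note (not part of the original tutorial): np.exp(z) overflows for large z.
# Subtracting max(z) first leaves the result unchanged and is the usual
# numerically stable formulation; a minimal sketch:
def softmax_stable(z):
    e = np.exp(z - np.max(z))
    return e / np.sum(e)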
###############################################################################
# Plot the softmax output for 2 dimensions for both classes
# Plot the output as a function of the inputs
# Define a vector of input z values for which we want to plot the output
nb_of_zs = 200
zs = np.linspace(-10, 10, num=nb_of_zs) # input
zs_1, zs_2 = np.meshgrid(zs, zs) # generate grid
# each is a 200x200 matrix
y = np.zeros((nb_of_zs, nb_of_zs, 2)) # initialize output
# Fill the output matrix for each combination of input z's
for i in range(nb_of_zs):
    for j in range(nb_of_zs):
        y[i,j,:] = softmax(np.asarray([zs_1[i,j], zs_2[i,j]]))
# The coordinates of each grid point naturally supply the two input values;
# passing both through softmax yields their normalized comparison
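# Quick check (added here for illustration): the two softmax outputs at every
# grid point are complementary and sum to 1
assert np.allclose(y.sum(axis=2), 1.0)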
###############################################################################
# Plot the softmax output surfaces (the t=2 surface is the complement of t=1)
fig = plt.figure()
# Plot the output surface for P(t=1|z)
ax = fig.gca(projection='3d')
surf = ax.plot_surface(zs_1, zs_2, y[:,:,0], linewidth=0, cmap=cm.coolwarm)
ax.view_init(elev=30, azim=70)
cbar = fig.colorbar(surf)
ax.set_xlabel('$z_1$', fontsize=15)
ax.set_ylabel('$z_2$', fontsize=15)
ax.set_zlabel('$y_1$', fontsize=15)
ax.set_title ('$P(t=1|\mathbf{z})$')
cbar.ax.set_ylabel('$P(t=1|\mathbf{z})$', fontsize=15)
plt.grid()
plt.show()
###############################################################################
Result:
Notes: printing the meshgrid outputs shows how the input grid is laid out:
zs_1 (Out[49]): every row is the same vector [-10., -9.89949749, -9.79899497, ..., 9.79899497, 9.89949749, 10.], i.e. zs repeated along the rows.
zs_2 (Out[50]): every column is that same vector (the first row is all -10., the last row all 10.), i.e. zs repeated down the columns. Together, (zs_1[i,j], zs_2[i,j]) enumerates every pair of inputs.