示意图

BP神经网络推导

符号说明

$$
\begin{aligned}
\boldsymbol{y}^{0}: &\ \text{输入,}\boldsymbol{y}^{0}\in \mathbb{R}^{s0\times1} \\
\boldsymbol{z}^{l}: &\ \text{第}l\text{层的加权输入(激活前),}\boldsymbol{z}^{l} \in \mathbb{R}^{sl \times 1} \\
\boldsymbol{y}^{l}: &\ \text{第}l\text{层的输出(激活后),}\boldsymbol{y}^{l} \in \mathbb{R}^{sl \times 1} \\
\boldsymbol{\sigma}: &\ \text{激活函数} \\
sl: &\ \text{表示第}l\text{层 }\boldsymbol{y}^{l}\text{、}\boldsymbol{z}^{l}\text{ 的向量维数} \\
\boldsymbol{t}: &\ \text{表示真实值} \\
L: &\ \text{一共}L\text{层} \\
f^l_{i}: &\ \text{表示}\frac{\partial{y^l_i}}{\partial{z^l_i}} = \sigma'(z^l_i) \\
\boldsymbol{I}(i): &\ \text{表示列向量,第}i\text{行为}1\text{,其余位置为}0 \\
\delta^l_i: &\ \text{表示}\frac{\partial{E}}{\partial{y^l_i}} \\
\boldsymbol{\delta}^l: &\ \text{表示}\frac{\partial{E}}{\partial{\boldsymbol{y}^l}}\text{,即行向量 }
\begin{pmatrix}
\delta^l_1 ,\delta^l_2 ,\cdots,\delta^l_{sl}
\end{pmatrix}
\end{aligned}
$$
它们之间的关系:
$$
\begin{aligned}
\boldsymbol{z}^{l} &= \boldsymbol{w}^{l}\boldsymbol{y}^{l-1} \\
z^{l}_i &= \sum_{j=1}^{s(l-1)} w^{l}_{ij}\, y^{l-1}_j \\
\boldsymbol{y}^{l} &= \boldsymbol{\sigma}(\boldsymbol{z}^{l}) \\
y^{l}_i &= \sigma(z^l_i)
\end{aligned}
$$

矩阵相关求导说明

符号说明
$\boldsymbol{y}$:列向量,$\boldsymbol{y} \in \mathbb{R}^{n \times 1}$

$\boldsymbol{x}$:列向量,$\boldsymbol{x} \in \mathbb{R}^{m \times 1}$

$f(\boldsymbol{x})$:实值标量函数,记做 $f: \mathbb{R}^m \to \mathbb{R}$
公式
$$
\begin{aligned}
\frac{\partial{\boldsymbol{y}^{\mathrm{T}}}}{\partial{\boldsymbol{x}}} &=
\begin{pmatrix}
\frac{\partial{y_1}}{\partial{x_1}} & \cdots & \frac{\partial{y_n}}{\partial{x_1}} \\
\vdots & & \vdots \\
\frac{\partial{y_1}}{\partial{x_m}} & \cdots & \frac{\partial{y_n}}{\partial{x_m}}
\end{pmatrix} \in \mathbb{R}^{m \times n} \\
\frac{\partial{\boldsymbol{y}^{\mathrm{T}}}}{\partial{\boldsymbol{y}}} &= \mathbf{E}_{n \times n} \\
\frac{\partial{f(\boldsymbol{x})}}{\partial{\boldsymbol{x}}} &=
\left[ \frac{\partial{f(\boldsymbol{x})}}{\partial{x_1}} , \cdots ,\frac{\partial{f(\boldsymbol{x})}}{\partial{x_m}} \right]^{\mathrm{T}}
\end{aligned}
$$

公式推导

误差定义
$$
\begin{aligned}
E &= \frac{1}{m}\sum_{p=1}^{m} E_p \\
E_p &= \frac{1}{2}\lVert \boldsymbol{y}^L - \boldsymbol{t} \rVert^2 \\
&= \frac{1}{2}\sum_{i=1}^{sL}(y^L_i - t_i)^2
\end{aligned}
$$

其中 $m$ 为样本数,$E_p$ 为第 $p$ 个样本的误差。为了推导简单,下文取 $m=1$,即 $E=E_p$。
$\frac{\partial{E}}{\partial{w^L_{ij}}}$
几点说明
$$
\begin{aligned}
\frac{\partial{z_k^l}}{\partial{w^l_{ij}}} &=
\begin{cases}
y^{l-1}_j & k = i \\
0 & k \ne i
\end{cases} \\
\frac{\partial{\boldsymbol{z}^l}}{\partial{w^l_{ij}}} &=
\left[ \frac{\partial{z_1^l}}{\partial{w_{ij}^l}} ,\cdots,\frac{\partial{z_{sl}^l}}{\partial{w_{ij}^l}} \right]^{\mathrm{T}} \in \mathbb{R}^{sl \times 1} \\
&= \boldsymbol{I}(i) \cdot y_j^{l-1} \\
\\
\frac{\partial{\boldsymbol{y}^l}}{\partial{(\boldsymbol{z}^{l})^{\mathrm{T}}}} &=
\begin{pmatrix}
\frac{\partial{y_1^l}}{\partial{z_1^l}} & \cdots & \frac{\partial{y_1^l}}{\partial{z_{sl}^l}} \\
\vdots & & \vdots \\
\frac{\partial{y_{sl}^l}}{\partial{z_1^l}} & \cdots & \frac{\partial{y_{sl}^l}}{\partial{z_{sl}^l}}
\end{pmatrix} \in \mathbb{R}^{sl \times sl} \\
&=
\begin{pmatrix}
f^l_1 & & & \\
& f^l_2 & & \\
& & \ddots & \\
& & & f^l_{sl}
\end{pmatrix} \\
\\
\frac{\partial{\boldsymbol{z}^{l}}}{\partial{(\boldsymbol{y}^{l-1})^{\mathrm{T}}}} &=
\begin{pmatrix}
\frac{\partial{z_1^l}}{\partial{y_1^{l-1}}} & \cdots & \frac{\partial{z_1^l}}{\partial{y_{s(l-1)}^{l-1}}} \\
\vdots & & \vdots \\
\frac{\partial{z_{sl}^l}}{\partial{y_1^{l-1}}} & \cdots & \frac{\partial{z_{sl}^l}}{\partial{y_{s(l-1)}^{l-1}}}
\end{pmatrix} \\
&=
\begin{pmatrix}
w_{11}^l & \cdots & w_{1(s(l-1))}^l \\
\vdots & & \vdots \\
w_{(sl)1}^l & \cdots & w_{(sl)(s(l-1))}^l
\end{pmatrix} \in \mathbb{R}^{sl \times s(l-1)} \\
\\
\frac{\partial{\boldsymbol{y}^l}}{\partial{\boldsymbol{y}^{l-1}}} &=
\frac{\partial{\boldsymbol{y}^l}}{\partial{\boldsymbol{z}^{l}}} \cdot
\frac{\partial{\boldsymbol{z}^l}}{\partial{\boldsymbol{y}^{l-1}}} \\
&= \frac{\partial{\boldsymbol{y}^l}}{\partial{(\boldsymbol{z}^{l})^{\mathrm{T}}}} \cdot
\frac{\partial{(\boldsymbol{z}^{l})^{\mathrm{T}}}}{\partial{\boldsymbol{z}^{l}}} \cdot
\frac{\partial{\boldsymbol{z}^{l}}}{\partial{(\boldsymbol{y}^{l-1})^{\mathrm{T}}}} \cdot
\frac{\partial{(\boldsymbol{y}^{l-1})^{\mathrm{T}}}}{\partial{\boldsymbol{y}^{l-1}}} \\
&= \frac{\partial{\boldsymbol{y}^l}}{\partial{(\boldsymbol{z}^{l})^{\mathrm{T}}}} \cdot
\frac{\partial{\boldsymbol{z}^{l}}}{\partial{(\boldsymbol{y}^{l-1})^{\mathrm{T}}}} \\
&=
\begin{pmatrix}
f^l_1 w_{11}^l & \cdots & f^l_1 w_{1(s(l-1))}^l \\
\vdots & & \vdots \\
f^l_{sl} w_{(sl)1}^l & \cdots & f^l_{sl} w_{(sl)(s(l-1))}^l
\end{pmatrix} \in \mathbb{R}^{sl \times s(l-1)} \\
\\
\frac{\partial{E}}{\partial{(\boldsymbol{y}^L)^{\mathrm{T}}}} &= [\, y^L_1 - t_1,\cdots, y^L_{sL} - t_{sL} \,] \\
\\
\frac{\partial{\boldsymbol{z}^l}}{\partial{y^{l-1}_i}} &= [\, w^l_{1i},w^l_{2i},\cdots,w^l_{(sl)i} \,]^{\mathrm{T}} \\
\\
\frac{\partial{\boldsymbol{y}^l}}{\partial{y^{l-1}_i}} &=
\frac{\partial{\boldsymbol{y}^l}}{\partial{\boldsymbol{z}^l}} \cdot
\frac{\partial{\boldsymbol{z}^l}}{\partial{y^{l-1}_i}} \\
&=
\begin{pmatrix}
f^l_1 & & & \\
& f^l_2 & & \\
& & \ddots & \\
& & & f^l_{sl}
\end{pmatrix} \cdot
\begin{pmatrix}
w^l_{1i} \\
w^l_{2i} \\
\vdots \\
w^l_{(sl)i}
\end{pmatrix} \\
&=
\begin{pmatrix}
f^l_1 w^l_{1i} \\
f^l_2 w^l_{2i} \\
\vdots \\
f^l_{sl} w^l_{(sl)i}
\end{pmatrix}
\end{aligned}
$$

求解
$$
\begin{aligned}
\frac{\partial{E}}{\partial{w^L_{ij}}} &=
\frac{\partial{E}}{\partial{\boldsymbol{y}^L}} \cdot
\frac{\partial{\boldsymbol{y}^L}}{\partial{w^L_{ij}}} \\
&= \frac{\partial{E}}{\partial{(\boldsymbol{y}^L)^{\mathrm{T}}}} \cdot
\frac{\partial{(\boldsymbol{y}^L)^{\mathrm{T}}}}{\partial{\boldsymbol{y}^L}} \cdot
\frac{\partial{\boldsymbol{y}^L}}{\partial{(\boldsymbol{z}^L)^{\mathrm{T}}}} \cdot
\frac{\partial{\boldsymbol{z}^L}}{\partial{w^L_{ij}}} \\
&=
\begin{pmatrix}
y^L_1 - t_1,\cdots, y^L_{sL} - t_{sL}
\end{pmatrix} \cdot
\begin{pmatrix}
f^L_1 & & \\
& \ddots & \\
& & f^L_{sL}
\end{pmatrix} \cdot
\boldsymbol{I}(i) \cdot y_j^{L-1} \\
&= (y^L_i - t_i)\, f^L_i\, y^{L-1}_j \\
\\
\frac{\partial{E}}{\partial{w^{L-1}_{ij}}} &=
\frac{\partial{E}}{\partial{\boldsymbol{y}^L}} \cdot
\frac{\partial{\boldsymbol{y}^L}}{\partial{\boldsymbol{y}^{L-1}}} \cdot
\frac{\partial{\boldsymbol{y}^{L-1}}}{\partial{w^{L-1}_{ij}}} \\
&=
\begin{pmatrix}
y^L_1 - t_1,\cdots, y^L_{sL} - t_{sL}
\end{pmatrix} \cdot
\begin{pmatrix}
f^L_1 w_{11}^L & \cdots & f^L_1 w_{1(s(L-1))}^L \\
\vdots & & \vdots \\
f^L_{sL} w_{(sL)1}^L & \cdots & f^L_{sL} w_{(sL)(s(L-1))}^L
\end{pmatrix} \cdot
\boldsymbol{I}(i) \cdot f^{L-1}_i\, y_j^{L-2} \\
&= \sum_{k=1}^{sL}(y_k^L - t_k)\, f_k^L\, w^L_{ki}\, f^{L-1}_i\, y^{L-2}_j
\end{aligned}
$$
另一种定义方法
$$
\begin{aligned}
\frac{\partial{E}}{\partial{w^L_{ij}}} &=
\frac{\partial{E}}{\partial{y^L_i}} \cdot
\frac{\partial{y^L_i}}{\partial{w^L_{ij}}} \\
&= (y^L_i - t_i)\, f^L_i\, y_j^{L-1} \\
\\
\frac{\partial{E}}{\partial{w^{L-1}_{ij}}} &=
\frac{\partial{E}}{\partial{\boldsymbol{y}^L}} \cdot
\frac{\partial{\boldsymbol{y}^L}}{\partial{y^{L-1}_i}} \cdot
\frac{\partial{y^{L-1}_i}}{\partial{w^{L-1}_{ij}}} \\
&=
\begin{pmatrix}
y^L_1 - t_1,\cdots, y^L_{sL} - t_{sL}
\end{pmatrix} \cdot
\begin{pmatrix}
f^L_1 w^L_{1i} \\
f^L_2 w^L_{2i} \\
\vdots \\
f^L_{sL} w^L_{(sL)i}
\end{pmatrix} \cdot f^{L-1}_i\, y_j^{L-2} \\
&= \sum_{k=1}^{sL}(y_k^L - t_k)\, f_k^L\, w^L_{ki}\, f^{L-1}_i\, y^{L-2}_j \\
\\
\delta^{l-1}_i &= \frac{\partial{E}}{\partial{y^{l-1}_i}} \\
&= \frac{\partial{E}}{\partial{\boldsymbol{y}^{l}}} \cdot
\frac{\partial{\boldsymbol{y}^{l}}}{\partial{y^{l-1}_i}} \\
&=
\begin{pmatrix}
\delta^l_1 ,\delta^l_2 ,\cdots,\delta^l_{sl}
\end{pmatrix} \cdot
\begin{pmatrix}
f^l_1 w^l_{1i} \\
f^l_2 w^l_{2i} \\
\vdots \\
f^l_{sl} w^l_{(sl)i}
\end{pmatrix} \\
&= \sum_{k=1}^{sl}\delta^l_k\, f^l_k\, w^l_{ki} \\
\\
\frac{\partial{E}}{\partial{w^L_{ij}}} &=
\frac{\partial{E}}{\partial{y^L_i}} \cdot
\frac{\partial{y^L_i}}{\partial{w^L_{ij}}} \\
&= \delta^L_i\, f^L_i\, y_j^{L-1} \\
&= (y^L_i - t_i)\, f^L_i\, y_j^{L-1} \\
\\
\frac{\partial{E}}{\partial{w^{l}_{ij}}} &=
\frac{\partial{E}}{\partial{y^l_i}} \cdot
\frac{\partial{y^{l}_i}}{\partial{w^{l}_{ij}}} \\
&= \delta^{l}_i\, f^l_i\, y_j^{l-1} \\
&= \left( \sum_{k=1}^{s(l+1)} \delta^{l+1}_k\, f^{l+1}_k\, w^{l+1}_{ki} \right) f^l_i\, y_j^{l-1}
\end{aligned}
$$

相关文章: