Tensorflow - numpy梯度检查不起作用答案

【问题标题】：Tensorflow - numpy gradient check doesnt workTensorflow - numpy梯度检查不起作用
【发布时间】：2021-06-12 13:30:11
【问题描述】：

我正在尝试通过有限差分法估计函数的梯度： finite difference method for estimating gradient

TLDR：

grad f(x) = [f(x+h)-f(x-h)]/(2h) 足够小的 h。

这也用于梯度检查阶段，以检查您在 AI 中的反向传播。

这是我的网络：

def defineModel():
  global model
  model = Sequential()
  model.add(keras.Input(shape=input_shape))
  model.add(layers.Conv2D(32, kernel_size=(3, 3), activation="relu"))
  model.add(layers.MaxPooling2D(pool_size=(2, 2)))
  model.add(layers.Conv2D(64, kernel_size=(3, 3), activation="relu"))
  model.add(layers.MaxPooling2D(pool_size=(2, 2)))
  model.add( layers.Flatten())
  model.add(layers.Dropout(0.5))
  model.add(layers.Dense(num_classes, activation="softmax"))
  model.build()
  model.summary()

这部分很好，没有错误。我刚刚在这里提到了它，所以你对我的模型有一定的了解。我在 MNIST 工作，所以一切都很简单。通过 1 个 epoch 和几行 TF 代码，我达到了 +98% 的准确率，这对于临时模型来说是相当不错的。

由于我正在进行对抗性训练，我想要我的损失相对于输入数据的梯度：

顺便说一句，我使用了平铺的想法：

如果您使用 (tile*tile) 维度的方形图块覆盖输入图像且没有重叠，您可以假设图块内的图像梯度几乎是恒定的，因此它是一个合理的近似值。但作为调试问题，在我的代码tile=1 中，我们正在计算逐像素梯度。

这就是问题所在，但我不知道在哪里！我控制了loss(x+h) 和loss(x-h) 和loss(x) 的损失几乎在很近的范围内，所以我知道这很好。我的 TF 自动反向传播也可以正常工作，我已经对其进行了测试。问题一定出在手动计算梯度的方式上。

tile=1
h=1e-4 #also tried 1e-5, 1e-6 but did not work

#A dummy function to wait
def w():
  print()
  ww=input('wait')
  print()

#This function works fine.
def rel_error(x, y):
  """ returns relative error """
  return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

#the problem is here. given an index suah is idx, I'm going to manipulate
#x_test[idx] and compute loss gradient with respect to this input
def estimateGrad(idx):
  y=model(np.expand_dims(x_test[idx],axis=0))
  y=np.expand_dims(y_train[idx],axis=0)
  x=np.squeeze(x_test[idx])

  

  cce = tf.keras.losses.CategoricalCrossentropy()
  grad=np.zeros((28,28))
  num=int(28/tile)

  #MNIST pictures are 28*28 pixels. Now grad is 28*28 nd.array
  #and x is input image converted to 28*28 nd.array
  for i in range(num):
    for j in range(num):

      plus=copy.deepcopy(x)
      minus=copy.deepcopy(x)
      #Now plus is x+h and minus is x-h
      plus[i*tile:(i+1)*tile , j*tile:(j+1)*tile]=  plus[i*tile:(i+1)*tile , j*tile:(j+1)*tile]+h
      minus[i*tile:(i+1)*tile , j*tile:(j+1)*tile]=  minus[i*tile:(i+1)*tile , j*tile:(j+1)*tile]-h

       
      plus=np.expand_dims(plus,axis=(0,-1))
      minus=np.expand_dims(minus,axis=(0,-1))

      #Now we pass plus and minus to model prediction in the next two lines
      plus=model(plus)
      minus=model(minus)
      
      #Since I want to find gradient of loss with respect to x, in the next 
      #two lines I will set plus=loss(x+h) and minus=loss(x-h)
      #In other words, here our finction f which we want to cumpute its grads
      #is the loss function

      plus=cce(y,plus).numpy()
      minus=cce(y,minus).numpy()

      #Applying the formola : grad=(loss(x+h)-loss(x-h))/2/h
      grad[i*tile:(i+1)*tile , j*tile:(j+1)*tile]=(plus-minus)/2/h

 #Ok now lets check our grad with TF autograd module
  x= tf.convert_to_tensor(np.expand_dims(x_test[idx], axis=0)) 
  with tf.GradientTape() as tape:
    tape.watch(x) 
    y=model(x)
    y_expanded=np.expand_dims(y_train[idx],axis=0)  
    loss=cce(y_expanded,y)

     
  
  delta=tape.gradient(loss,x)
  delta=delta.numpy()
  delta=np.squeeze(delta)

  #delta is gradients returned by TF via auto-differentiation.
  #We calculate the error and unfortunately its large
  diff=rel_error(grad,delta)

  print('diff ',diff)
  w()
  #problem : diff is very large. it should be less than 1e-4

您可以参考here获取我的完整代码。

标签： python tensorflow computer-vision mnist adversarial-machines

【解决方案1】：

我用自己的解决方案gradient 替换了梯度估计代码，代码现在可以工作了。计算错误可能很棘手。正如在底部的直方图（注意对数刻度）上所见，对于大多数像素，相对误差小于10^-4，但在梯度接近零的情况下，相对误差会爆炸。 max(rel_err) 和 mean(rel_err) 的问题在于，它们都容易受到异常像素的干扰。衡量数量级是否最相关的更好衡量标准是所有非零像素的几何平均值和中值。

进口

import tensorflow as tf
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import time
import copy

渐变

# Gradient
def gradient(f, x, h, **kwargs):
    shape = x.shape    
    x = x.ravel()
    out = np.zeros_like(x)
    hs = np.zeros_like(x)
    for i, _ in enumerate(x):
        hs *= 0.
        hs[i] = h
        out[i] = (f((x + hs).reshape(shape), **kwargs) - f((x - hs).reshape(shape), **kwargs)) / (2*h)

    return out.reshape(shape)

模型定义等

不是我的代码。

x_train=y_train=x_test=y_test=None
num_classes = 10
input_shape = (28, 28, 1)
batch_size= 128
epochs=1
h=1e-4
epsilon=1e-3
tile=1

model=None

def testImage(clean_img,adv_image,truth,pred):

    # image = np.squeeze(x_test[index])
    # # plot the sample
    # fig = plt.figure
    # plt.imshow(image, cmap='gray')
    # plt.show()

    # x= np.expand_dims(x_test[index], axis=0)
    # y=model(x)
    # print('model prediction clean example : ',np.argmax(y))
    # print('the ground truth : ',y_train[index])

    print("clean image : \n")
    fig=plt.figure
    plt.imshow(clean_img, cmap='gray')
    plt.show()

    print("adv image : \n")
    fig=plt.figure
    plt.imshow(adv_image, cmap='gray')
    plt.show()

    print('model prediction was : ',pred)
    print('truth was :',truth)

    print('\n\n')


def prepareDataset():
    global x_train,y_train,x_test,y_test
    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
    x_train = x_train.astype("float32") / 255
    x_test = x_test.astype("float32") / 255
    x_train = np.expand_dims(x_train, -1)
    x_test = np.expand_dims(x_test, -1)
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes) 


def defineModel():
    global model
    model = Sequential()
    model.add(keras.Input(shape=input_shape))
    model.add(layers.Conv2D(32, kernel_size=(3, 3), activation="relu"))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.Conv2D(64, kernel_size=(3, 3), activation="relu"))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add( layers.Flatten())
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(num_classes, activation="softmax"))
    model.build()
    model.summary()


def trainModel():
    global model
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)


def evalModel():
    score = model.evaluate(x_test, y_test, verbose=0)
    print("Test loss:", score[0])
    print("Test accuracy:", score[1])

def fgsm(epsilon,index):
    x = tf.convert_to_tensor(np.expand_dims(x_test[index], axis=0)) 
    with tf.GradientTape() as tape:
        tape.watch(x)

        cce = tf.keras.losses.CategoricalCrossentropy()
        y = model(x)
        y_expanded = np.expand_dims(y_train[index],axis=0)
        #y=np.squeeze(tf.transpose(y))
        loss = cce(y_expanded,y)
        delta = tape.gradient(loss,x)   
        perturbation=epsilon*tf.math.sign(delta)
        return perturbation


def w():
    print()
    ww=input('wait')
    print()


def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

估计梯度

def estimateGrad(idx):

    cce = tf.keras.losses.CategoricalCrossentropy()

    x = x_test[idx]
    y = y_test[idx]

    def f(x, y):
        y_pred = model(np.expand_dims(x,axis=0))
        return cce(y, y_pred[0])

    grad = gradient(f, x, 1e-3, y=y)

    tfx = tf.convert_to_tensor(np.expand_dims(x,axis=0)) 
    with tf.GradientTape() as tape:
        tape.watch(tfx) 
        y_pred = model(tfx)
        loss=cce(y, y_pred[0])

        delta = tape.gradient(loss, tfx)
        delta = delta.numpy()
        delta = np.squeeze(delta)

    return grad, delta

调用一切

所以从技术上讲，这是可行的。不幸的是，h < 1/1000 遇到了数值问题，所以这是我们可以通过有限差分实现的最佳结果。

prepareDataset()
trainModel()
grad, delta = estimateGrad(2)
abs_err = abs(delta - grad).max()
rel_err = (abs(delta - grad / np.clip(delta, 1e-8, np.infty)))

print('Absolute Error: ', abs_err)
print('Median Relative Error: ', np.median(rel_err))
print('Mean Relative Error: ', np.mean(rel_err))
print('Geometric Mean Relative Error: ', gmean(rel_err[rel_err > 0].ravel()))
print('Max Relative Error: ', np.max(rel_err))   

fig, ax = plt.subplots(1, 4, figsize=(20, 5))

ax[0].imshow(grad)
ax[0].set_title('Discrete gradient')

ax[1].imshow(delta)
ax[1].set_title('Analytic gradient')

ax[1].set_title('Relative errorr')
ax[2].imshow(rel_err + 1)

ax[3].set_title('Histogram log-relative error')
logbins = np.geomspace(1e-8, rel_err.max() + 1, 8)
ax[3].hist(rel_err, bins=logbins)
ax[3].set_xscale('log')
plt.show()

输出：

Absolute Error:  0.0001446546
Median Relative Error:  8.748577e-06
Mean Relative Error:  1781.9397
Geometric Mean Relative Error:  0.009761388
Max Relative Error:  53676.195

【讨论】：