【问题标题】:Training Accuracy is Very Low in A Simple CNN using Theano使用 Theano 的简单 CNN 中的训练精度非常低
【发布时间】:2018-01-25 19:00:18
【问题描述】:

我正在尝试使用 Theano 实现 CNN,并尝试使用较大数据集的小样本集来测试我的代码。我试图将一组 8280 张图片(250*250 尺寸)分类为 115 个类,我的样本集是前两个类的 32 张图片(每个类 16 张图片)。我遇到的问题是,从第一个时期开始,NaN 的训练损失在以后的时期不会改变。

from __future__ import print_function

import sys
import os
import time

import numpy as np
import theano
import theano.tensor as T

import lasagne
import re
import cv2
from lasagne.layers import Conv2DLayer, MaxPool2DLayer , DropoutLayer
from lasagne.layers import InputLayer, DenseLayer, batch_norm

def split_list(a_list):
    half = len(a_list)/2
    return a_list[:half], a_list[half:]

def load_dataset(path=''):
    cat_list = []
    filelist = sorted(os.listdir(path))
    trainlist = []
    testlist = []
    tmptrain = []
    tmptest = []
    max_id = 0
    for f in filelist:
        match = re.match(r'C(\d+)([F|G])(\d+)\.PNG', f)
        id = int(match.group(1)) - 1
        max_id = max(max_id,id)
        fg_class = match.group(2)
        fg_id = int(match.group(3))
        if id not in [p[0] for p in cat_list]:
            cat_list.append([id, [], []])
        if fg_class == 'G':
            cat_list[-1][1].append(f)
        else:
            cat_list[-1][2].append(f)
    for f in cat_list:
        id = f[0]
        trainG, testG = split_list(f[1])
        trainF, testF = split_list(f[2])
        tmptrain = tmptrain + [(id, 1, F) for F in trainF] + [(id, 0, G) for G in trainG]  # (Class_id,Forgery,Img)
        tmptest = tmptest + [(id, 1, F) for F in testF] + [(id, 0, F) for F in testG]
    X_train = np.array([cv2.imread(path+f[2],0) for f in tmptrain]).astype(np.int32)
    y_train = np.array([f[0] for f in tmptrain]).astype(np.int32)
    X_test = np.array([cv2.imread(path+f[2],0) for f in tmptest]).astype(np.int32)
    y_test = np.array([f[0] for f in tmptest]).astype(np.int32)
    fg_train = np.array([f[1] for f in tmptrain]).astype(np.int32)
    fg_test = np.array([f[1] for f in tmptest]).astype(np.int32)

    X_train = np.expand_dims(X_train,axis=1).astype(np.int32)
    X_test = np.expand_dims(X_test, axis=1).astype(np.int32)

    return X_train, y_train, X_test, y_test, fg_train , fg_test


def ExplicitNegativeCorrelation(net,layer='fc2',lr=0.00001):
    for param in lasagne.layers.get_all_params(net[layer]):
        if param.name.startswith('W'):
            W = param
            mean = T.mean(W,0) * lr
            W = W - mean#T.mean(T.mean(W,0))
def ImplicitNegativeCorrelation(MSE,Cross,Hinge):
    mean = T.mean((MSE+Cross+Hinge),axis=0)
    return ((MSE-mean)**2+(Cross-mean)**2+(Hinge-mean)**2)/3

def build_cnn(inputvar,input_shape, trained_weights=None):

    net = {}

    net['input'] = InputLayer(input_shape,input_var=inputvar)
    net['drop_input'] = DropoutLayer(net['input'],p=0.2)
    net['conv1'] = batch_norm(Conv2DLayer(net['input'], num_filters=96, filter_size=11, stride=4, flip_filters=False))#,W=lasagne.init.HeNormal()))
    net['pool1'] = MaxPool2DLayer(net['conv1'], pool_size=3, stride=2)

    net['conv2'] = batch_norm(Conv2DLayer(net['pool1'], num_filters=256, filter_size=5, pad=2, flip_filters=False))#, W=lasagne.init.HeNormal()))
    net['pool2'] = MaxPool2DLayer(net['conv2'], pool_size=3, stride=2)

    net['conv3'] = batch_norm(Conv2DLayer(net['pool2'], num_filters=384, filter_size=3, pad=1, flip_filters=False))#, W=lasagne.init.HeNormal()))
    net['conv4'] = batch_norm(Conv2DLayer(net['conv3'], num_filters=384, filter_size=3, pad=1, flip_filters=False))#, W=lasagne.init.HeNormal()))
    net['conv5'] = batch_norm(Conv2DLayer(net['conv4'], num_filters=256, filter_size=3, pad=1, flip_filters=False))#, W=lasagne.init.HeNormal()))
    net['pool5'] = MaxPool2DLayer(net['conv5'], pool_size=3, stride=2)

    net['fc1'] = batch_norm(DenseLayer(net['pool5'], num_units=2048))
    net['drop_fc1'] = DropoutLayer(net['fc1'])

    net['fc2'] = batch_norm(DenseLayer(net['drop_fc1'], num_units=2048))
    net['fc_class'] = batch_norm(DenseLayer(net['fc2'],num_units=115))

    return net



def iterate_minibatches(inputs, targets_class,targets_verif, batchsize, shuffle=False):
    assert len(inputs) == len(targets_class)
    assert len(inputs) == len(targets_verif)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets_class[excerpt], targets_verif[excerpt]





def main(num_epochs=500):

    print("Loading data...")
    X_train, y_train, X_test, y_test, fg_train, fg_test = load_dataset('./signatures/tmp4/')
    X_val, y_val, fg_val = X_train, y_train, fg_train
    print(y_train.shape)

    input_var = T.tensor4('inputs')
    target_var_class = T.ivector('targets')

    network = build_cnn(input_var, (None, 1, 250, 250))

    class_prediction = lasagne.layers.get_output(network['fc_class'])  # ,inputs={network['input']:input_var})
    loss_class = lasagne.objectives.categorical_crossentropy(class_prediction, target_var_class)

    loss = loss_class.mean()
    params = lasagne.layers.get_all_params([network['fc_class']], trainable=True)


    lr = 0.01
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=lr, momentum=0.9)

    test_prediction_class = lasagne.layers.get_output(network['fc_class'], deterministic=True)
    test_loss_class = lasagne.objectives.categorical_crossentropy(test_prediction_class,
                                                            target_var_class)
    test_loss_class = test_loss_class.mean()
    test_acc_class = T.mean(T.eq(T.argmax(test_prediction_class, axis=1), target_var_class),
                      dtype=theano.config.floatX)


    predict_class = theano.function([input_var], T.argmax(test_prediction_class,axis=1))

    train_fn = theano.function([input_var, target_var_class], loss, updates=updates)

    val_fn_class = theano.function([input_var, target_var_class], [test_loss_class, test_acc_class])

    print("Starting training...")
    BatchSize = 2
    for epoch in range(num_epochs):
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train,fg_train, BatchSize, shuffle=True):
            inputs, targets_class, targets_verif = batch
            train_err += train_fn(inputs, targets_class)
            #ExplicitNegativeCorrelation(network, layer='fc2',lr=lr/10)
            print(targets_class,predict_class(inputs))
            train_batches += 1

        val_err_class = 0
        val_acc_class = 0

        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, fg_val, BatchSize, shuffle=False):
            inputs, targets_class, targets_verif = batch
            err_class, acc_class = val_fn_class(inputs, targets_class)
            val_err_class += err_class
            val_acc_class += acc_class
            val_batches += 1

        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  Classification loss:\t\t{:.6f}".format(val_err_class / val_batches))
        print("  Classification accuracy:\t\t{:.2f} %".format(
            val_acc_class / val_batches * 100))

    test_err_class = 0
    test_acc_class = 0
    test_err_verif = 0
    test_acc_verif = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, fg_test, BatchSize, shuffle=False):
        inputs, targets_class, targets_verif = batch
        err_class, acc_class = val_fn_class(inputs, targets_class)
        test_err_class += err_class
        test_acc_class += acc_class
        test_batches += 1
    print("Final results:")
    print("  test loss (Classification):\t\t\t{:.6f}".format(test_err_class / test_batches))
    print("  test accuracy (Classification):\t\t{:.2f} %".format(
        test_acc_class / test_batches * 100))

if __name__ == '__main__':
   main()

我尝试将lasagne.nonlinearities.softmax 放在 DenseLayers 中,但它确实解决了 NaN 问题,但训练模型的准确性不会有任何好处,它会在 0 到 25% 之间波动。(经过 50 个 epoch! )。

我已经实现了一个我认为可以正常工作的load_dataset 函数(我已经多次测试了该函数),并且我将每张图片的类ID 作为损失函数中的目标。所以我的输入和目标是这样的:

Input Shape: (BatchSize, 1, 250, 250) 
Target Shape: (BatchSize, 1) : vector of class ids

我已经在this link 上传了我的样本集。

【问题讨论】:

    标签: deep-learning theano conv-neural-network


    【解决方案1】:

    根据数据,看起来我们有 4 个类,所以我更改了加载代码以反映它:

    y_train = np.array([f[0] * 2 + f[1] for f in tmptrain]).astype(np.int32)
    y_test = np.array([f[0] * 2 + f[1] for f in tmptest]).astype(np.int32)
    

    输出层的单元数应该等于类的数量,所以我用SoftMax添加了一个输出层:

    net['fo_class'] = DenseLayer(net['fc_class'],num_units=4,
                        nonlinearity=lasagne.nonlinearities.softmax)
    

    我建议在输入之后删除 dropout 层——你可以比较结果与它和没有它以确保这一点

    Batch size = 2 太小,学习率太高

    以下是包含这些更改的代码示例:

    from __future__ import print_function
    
    import sys
    import os
    import time
    
    import numpy as np
    import theano
    import theano.tensor as T
    
    import lasagne
    import re
    import cv2
    from lasagne.layers import Conv2DLayer, MaxPool2DLayer , DropoutLayer
    from lasagne.layers import InputLayer, DenseLayer
    
    def split_list(a_list):
        half = len(a_list)/2
        return a_list[:half], a_list[half:]
    
    def load_dataset(path=''):
        cat_list = []
        filelist = sorted(os.listdir(path))
        tmptrain = []
        tmptest = []
        max_id = 0
        for f in filelist:
            match = re.match(r'C(\d+)([F|G])(\d+)\.PNG', f)
            id = int(match.group(1)) - 1
            max_id = max(max_id,id)
            fg_class = match.group(2)
            if id not in [p[0] for p in cat_list]:
                cat_list.append([id, [], []])
            if fg_class == 'G':
                cat_list[-1][1].append(f)
            else:
                cat_list[-1][2].append(f)
        for f in cat_list:
            id = f[0]
            trainG, testG = split_list(f[1])
            trainF, testF = split_list(f[2])
            tmptrain = tmptrain + [(id, 1, F) for F in trainF] + [(id, 0, G) for G in trainG]  
            tmptest = tmptest + [(id, 1, F) for F in testF] + [(id, 0, F) for F in testG]
        X_train = np.array([cv2.imread(path+f[2],0) for f in tmptrain]).astype(np.float32)
        y_train = np.array([f[0] * 2 + f[1] for f in tmptrain]).astype(np.int32)
        X_test = np.array([cv2.imread(path+f[2],0) for f in tmptest]).astype(np.float32)
        y_test = np.array([f[0] * 2 + f[1] for f in tmptest]).astype(np.int32)
        fg_train = np.array([f[1] for f in tmptrain]).astype(np.float32)
        fg_test = np.array([f[1] for f in tmptest]).astype(np.float32)
    
        X_train = np.expand_dims(X_train,axis=1).astype(np.float32)
        X_test = np.expand_dims(X_test, axis=1).astype(np.float32)
    
        return X_train, y_train, X_test, y_test, fg_train , fg_test
    
    
    def ExplicitNegativeCorrelation(net,layer='fc2',lr=0.00001):
        for param in lasagne.layers.get_all_params(net[layer]):
            if param.name.startswith('W'):
                W = param
                mean = T.mean(W,0) * lr
                W = W - mean
    
    def ImplicitNegativeCorrelation(MSE,Cross,Hinge):
        mean = T.mean((MSE+Cross+Hinge),axis=0)
        return ((MSE-mean)**2+(Cross-mean)**2+(Hinge-mean)**2)/3
    
    def build_cnn(inputvar,input_shape, trained_weights=None):
    
        net = {}
    
        net['input'] = InputLayer(input_shape,input_var=inputvar)    
        net['conv1'] = Conv2DLayer(net['input'], num_filters=96, filter_size=11, stride=4)
        net['pool1'] = MaxPool2DLayer(net['conv1'], pool_size=3, stride=2)
    
        net['conv2'] = Conv2DLayer(net['pool1'], num_filters=256, filter_size=5, pad=2)
        net['pool2'] = MaxPool2DLayer(net['conv2'], pool_size=3, stride=2)
    
        net['conv3'] = Conv2DLayer(net['pool2'], num_filters=384, filter_size=3, pad=1)
        net['conv4'] = Conv2DLayer(net['conv3'], num_filters=384, filter_size=3, pad=1)
        net['conv5'] = Conv2DLayer(net['conv4'], num_filters=256, filter_size=3, pad=1)
    
        net['pool5'] = MaxPool2DLayer(net['conv5'], pool_size=3, stride=2)
    
        net['fc1'] = DenseLayer(net['pool5'], num_units=2048)
        net['drop_fc1'] = DropoutLayer(net['fc1'])
    
        net['fc2'] = DenseLayer(net['drop_fc1'], num_units=2048)    
        net['fc_class'] = DenseLayer(net['fc2'],num_units=115)
    
        net['fo_class'] = DenseLayer(net['fc_class'],num_units=4,
                            nonlinearity=lasagne.nonlinearities.softmax)
        return net
    
    
    def iterate_minibatches(inputs, targets_class,targets_verif, batchsize, shuffle=False):
        assert len(inputs) == len(targets_class)
        assert len(inputs) == len(targets_verif)
        if shuffle:
            indices = np.arange(len(inputs))
            np.random.shuffle(indices)
        for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
            if shuffle:
                excerpt = indices[start_idx:start_idx + batchsize]
            else:
                excerpt = slice(start_idx, start_idx + batchsize)
            yield inputs[excerpt], targets_class[excerpt], targets_verif[excerpt]
    
    def main(num_epochs=500):
    
        print("Loading data...")
        X_train, y_train, X_test, y_test, fg_train, fg_test = load_dataset('./signatures/tmp4/')
        X_train /= 255
        X_val, y_val, fg_val = X_train, y_train, fg_train
        print(y_train.shape)    
        check =  X_train[0][0]
        print(check)    
    
        input_var = T.tensor4('inputs')
        target_var_class = T.ivector('targets')
    
        network = build_cnn(input_var, (None, 1, 250, 250))
    
        class_prediction = lasagne.layers.get_output(network['fo_class'])  
    
        loss_class = lasagne.objectives.categorical_crossentropy(class_prediction, target_var_class)
    
        loss = loss_class.mean()
        params = lasagne.layers.get_all_params([network['fo_class']], trainable=True)
    
    
        lr = 0.0007
        updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=lr, momentum=0.9)
    
        test_prediction_class = lasagne.layers.get_output(network['fo_class'], deterministic=True)
        test_loss_class = lasagne.objectives.categorical_crossentropy(test_prediction_class,
                                                                target_var_class)
    
        test_loss_class = test_loss_class.mean()
        test_acc_class = T.mean(T.eq(T.argmax(test_prediction_class, axis=1), target_var_class),
                          dtype=theano.config.floatX)                     
    
        predict_class = theano.function([input_var], T.argmax(test_prediction_class,axis=1))
    
        train_fn = theano.function([input_var, target_var_class], loss, updates=updates)
    
        val_fn_class = theano.function([input_var, target_var_class], [test_loss_class, test_acc_class])
    
        print("Starting training...")
        BatchSize = 16
        for epoch in range(num_epochs):
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(X_train, y_train,fg_train, BatchSize, shuffle=True):
                inputs, targets_class, targets_verif = batch
                train_err += train_fn(inputs, targets_class)
                print(targets_class,predict_class(inputs))
                train_batches += 1
    
            val_err_class = 0
            val_acc_class = 0
    
            val_batches = 0
            for batch in iterate_minibatches(X_val, y_val, fg_val, BatchSize, shuffle=False):
                inputs, targets_class, targets_verif = batch
                err_class, acc_class = val_fn_class(inputs, targets_class)
                val_err_class += err_class
                val_acc_class += acc_class
                val_batches += 1
    
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
            print("  Classification loss:\t\t{:.6f}".format(val_err_class / val_batches))
            print("  Classification accuracy:\t\t{:.2f} %".format(
                val_acc_class / val_batches * 100))
    
        test_err_class = 0
        test_acc_class = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test, y_test, fg_test, BatchSize, shuffle=False):
            inputs, targets_class, targets_verif = batch
            err_class, acc_class = val_fn_class(inputs, targets_class)
            test_err_class += err_class
            test_acc_class += acc_class
            test_batches += 1
        print("Final results:")
        print("  test loss (Classification):\t\t\t{:.6f}".format(test_err_class / test_batches))
        print("  test accuracy (Classification):\t\t{:.2f} %".format(
            test_acc_class / test_batches * 100))
    
    if __name__ == '__main__':
       main()
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2016-06-20
      • 2018-04-13
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多