[Note]: There is no "human intervention" in the highlighted (yellow) region! The strength of a neural network is that every problem can be attacked with the same workflow: by repeatedly learning from the data it is given, the network tries to discover the pattern of the problem at hand, learning "end to end" directly from the raw data.
# Mean squared error
import numpy as np
def mean_squared_error(y, t):
    return 0.5 * np.sum((y - t)**2)
# Simulated data:
# t is the one-hot target for the handwritten digit 2
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
# y is the predicted probability distribution, as output by the softmax function
y1 = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
y2 = [0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]
result1 = mean_squared_error(np.array(y1), np.array(t))
print(np.argmax(y1))
print(result1)
result2 = mean_squared_error(np.array(y2), np.array(t))
print(np.argmax(y2))
print(result2)
2
0.09750000000000003
7
0.5975
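Both results can be checked by hand: y1 puts its largest probability (0.6) on the correct class 2, and its error is 0.5 * (0.1^2 + 0.05^2 + 0.4^2 + 0.05^2 + 0.1^2 + 0.1^2) = 0.0975, while y2 puts its largest probability on class 7 and its error grows to 0.5975. The mean squared error is therefore smaller for the prediction that agrees better with the target.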
import numpy as np
import matplotlib.pylab as plt
# Plot the cross-entropy error curve y = -log(x)
x = np.arange(1e-7, 1., 0.001)
y = - np.log(x)
plt.plot(x, y)
plt.ylim(0., 5.3)
plt.show()
# Define the cross-entropy error
def cross_entropy_error(y, t):
    delta = 1e-7  # small constant to avoid log(0)
    return -np.sum(t * np.log(y + delta))
# Simulated data:
# t is the one-hot target for the handwritten digit 2
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
# y is the predicted probability distribution, as output by the softmax function
y1 = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
y2 = [0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]
result1 = cross_entropy_error(np.array(y1), np.array(t))
print(result1)
result2 = cross_entropy_error(np.array(y2), np.array(t))
print(result2)
0.510825457099338
2.302584092994546
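With a one-hot target, the sum collapses to minus the log of the probability assigned to the correct class: -log(0.6) ≈ 0.51 for y1 and -log(0.1) ≈ 2.30 for y2 (natural logarithm). As with the mean squared error, the prediction that favours the correct class 2 gets the smaller error.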
Optimization in deep learning essentially comes down to gradient descent, and there are two basic ways to schedule the parameter updates. The first is to run through the entire data set, compute the loss, compute the gradient of the loss with respect to every parameter, and then update. Every single update has to visit every sample in the data set, so it is computationally expensive, slow, and incompatible with online learning; this is batch gradient descent.
The other extreme is to compute the loss, the gradient, and a parameter update after every individual sample; this is stochastic gradient descent (SGD). It is much faster, but its convergence behaviour is worse: it tends to wander around near the optimum without ever hitting it, and successive updates can partly cancel each other out, so the objective oscillates noticeably.
To avoid the drawbacks of both, the usual compromise is mini-batch gradient descent: the data are split into small batches and the parameters are updated once per batch. The samples within a batch jointly determine the direction of the step, so the descent is less noisy and less likely to go astray; at the same time a batch is far smaller than the whole data set, so each update remains cheap. A minimal sketch of the three schemes follows.
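The sketch below illustrates the three update schemes on a toy one-parameter least-squares problem. The toy data, the grad_w helper, the learning rate 0.1 and the batch size 32 are illustrative choices for this sketch only, not values used elsewhere in this chapter.
import numpy as np
# Toy data: fit y = w * x by least squares; w is the single parameter being learned.
np.random.seed(0)
x_data = np.random.rand(1000)
t_data = 3.0 * x_data + 0.1 * np.random.randn(1000)
def grad_w(w, x, t):
    # gradient of the loss 0.5 * mean((w*x - t)**2) with respect to w
    return np.mean((w * x - t) * x)
# 1) Batch gradient descent: every update uses all samples.
w = 0.0
for i in range(100):
    w -= 0.1 * grad_w(w, x_data, t_data)
print("batch GD:", w)
# 2) Stochastic gradient descent: one update per single sample.
w = 0.0
for i in np.random.permutation(len(x_data)):
    w -= 0.1 * grad_w(w, x_data[i:i+1], t_data[i:i+1])
print("SGD:", w)
# 3) Mini-batch gradient descent: one update per small random batch.
w = 0.0
for i in range(100):
    idx = np.random.choice(len(x_data), 32)
    w -= 0.1 * grad_w(w, x_data[idx], t_data[idx])
print("mini-batch GD:", w)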
The mini-batch version below uses the cross-entropy error as the example:
import numpy as np
def cross_entropy_error(y, t):
    # Promote a single sample to a batch of size 1
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    # Average the cross-entropy error over the batch
    return -np.sum(t * np.log(y + 1e-7)) / batch_size
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
y1 = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
result = cross_entropy_error(np.array(y1), np.array(t))
print(result)
0.510825457099338
t = [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]]
y2 = [[0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0],
[0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0],
[0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0],
[0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]]
result = cross_entropy_error(np.array(y2), np.array(t))
print(result)
1.406704775046942
# If the targets are no longer one-hot vectors but class indices
t = [2, 2, 2, 2]
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    # For each row, pick the probability assigned to the correct class
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
result = cross_entropy_error(np.array(y2), np.array(t))
print(result)
1.406704775046942
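Here t holds the class indices directly, and the fancy-indexing expression y[np.arange(batch_size), t] picks out, for every row of y, the probability assigned to the correct class (in this example y2[0, 2] = 0.6, y2[1, 2] = 0.6, y2[2, 2] = 0.1 and y2[3, 2] = 0.1), so the result is identical to the one-hot version above.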
[Note]: The derivative-related material below is there to help find the minimum of the loss function!
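The numerical_diff function below approximates the derivative with the central difference (f(x+h) - f(x-h)) / (2h) for a small h (here 1e-4); centering the difference around x gives a smaller error than the one-sided difference (f(x+h) - f(x)) / h.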
import numpy as np
import matplotlib.pylab as plt
# Numerical differentiation using the central difference
def numerical_diff(f, x):
    h = 1e-4  # 0.0001
    return (f(x+h) - f(x-h)) / (2*h)
# The function f1
def f1(x):
    return x**2
# Derivative of f1 at x = 5
print(numerical_diff(f1, 5))
9.999999999976694
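For comparison, the analytic derivative of f1(x) = x**2 at x = 5 is 2 * 5 = 10, so the numerical estimate above is essentially exact.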
def numerical_diff(f, x):
    h = 1e-4  # 0.0001
    return (f(x+h) - f(x-h)) / (2*h)
# f2(x0, x1) = x0^2 + x1^2
def f2(x):
    return x[0]**2 + x[1]**2
# Partial derivative of f2 with respect to x1 at the point (3, 4):
# fix x0 = 3 and differentiate the resulting one-variable function
def f2_temp(x):
    return 3.0**2.0 + x*x
print(numerical_diff(f2_temp, 4.0))
7.999999999999119
import numpy as np
# Gradient (single point, no batch)
def _numerical_gradient_no_batch(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)
    for idx in range(x.size):
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)
        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2 * h)
        # restore the original value
        x[idx] = tmp_val
    return grad
def f2(x):
    return x[0]**2 + x[1]**2
print(_numerical_gradient_no_batch(f2, np.array([3.0, 4.0])))
[6. 8.]
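This matches the analytic gradient of f2, (2*x0, 2*x1), which evaluates to (6, 8) at the point (3.0, 4.0).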
import numpy as np
import matplotlib.pylab as plt
def _numerical_gradient_no_batch(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)
    for idx in range(x.size):
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)
        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2*h)
        x[idx] = tmp_val  # restore the original value
    return grad
def numerical_gradient(f, X):
    # Compute the gradient for a single point or for a batch of points
    if X.ndim == 1:
        return _numerical_gradient_no_batch(f, X)
    else:
        grad = np.zeros_like(X)
        for idx, x in enumerate(X):
            grad[idx] = _numerical_gradient_no_batch(f, x)
        return grad
def function_2(x):
    if x.ndim == 1:
        return np.sum(x**2)
    else:
        return np.sum(x**2, axis=1)
def tangent_line(f, x):
    # Tangent line at x (kept from the original source; not used in the plot below)
    d = numerical_gradient(f, x)
    print(d)
    y = f(x) - d*x
    return lambda t: d*t + y
if __name__ == '__main__':
    x0 = np.arange(-2, 2.5, 0.25)
    x1 = np.arange(-2, 2.5, 0.25)
    X, Y = np.meshgrid(x0, x1)
    X = X.flatten()
    Y = Y.flatten()
    # Negative gradient at each grid point, drawn as an arrow field
    grad = numerical_gradient(function_2, np.array([X, Y]))
    plt.figure()
    plt.quiver(X, Y, -grad[0], -grad[1], angles="xy", color="#666666")
    plt.xlim([-2, 2])
    plt.ylim([-2, 2])
    plt.xlabel('x0')
    plt.ylabel('x1')
    plt.grid()
    plt.legend()
    plt.draw()
    plt.show()
import numpy as np
from Deep_Learning_From_Scratch.ch04.gradient_2d import numerical_gradient
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    """
    :param f: the function to minimize
    :param init_x: the starting point
    :param lr: the learning rate
    :param step_num: how many gradient steps to take
    :return: x is the point reached at the end; np.array(x_history) is the trajectory
    """
    x = init_x
    x_history = []
    for i in range(step_num):
        x_history.append(x.copy())
        grad = numerical_gradient(f, x)
        x -= lr * grad
    return x, np.array(x_history)
def f2(x):
    return x[0]**2 + x[1]**2
x_init = np.array([-3.0, 4.0])
destination, history = gradient_descent(f2, x_init)
print(destination)
# With a very large learning rate, the iterates diverge to huge values
x_init = np.array([-3.0, 4.0])
destination, history = gradient_descent(f2, x_init, lr=10)
print(destination)
# With a very small learning rate, the point has barely moved by the time the iterations end
x_init = np.array([-3.0, 4.0])
destination, history = gradient_descent(f2, x_init, lr=1e-15)
print(destination)
[-0.39785867 0.53047822]
[-2.58983747e+13 -1.29524862e+12]
[-3. 4.]
[Note]: The learning rate lr is a hyperparameter, set by hand; that makes it different in nature from the network's parameters (weights and biases), which the learning algorithm finds automatically. In practice one usually tries several candidate values and keeps the one that works best, as sketched below.
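A minimal sketch of such a comparison, reusing the gradient_descent and f2 defined above; the candidate learning-rate values are illustrative.
# Compare several candidate learning rates by the loss value reached after 100 steps.
for lr in (10.0, 1.0, 0.1, 0.01, 1e-5):
    destination, _ = gradient_descent(f2, np.array([-3.0, 4.0]), lr=lr, step_num=100)
    print("lr =", lr, " final f2 =", f2(destination))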
import numpy as np
def softmax(x):
    if x.ndim == 2:
        # Subtract the per-sample maximum for numerical stability, then normalize
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T
    x = x - np.max(x)  # guard against overflow
    return np.exp(x) / np.sum(np.exp(x))
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def sigmoid_grad(x):
    return (1.0 - sigmoid(x)) * sigmoid(x)
# Cross-entropy error
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    # If the targets are one-hot vectors, convert them to class-index labels
    if t.size == y.size:
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
def numerical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)
        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val  # restore the original value
        it.iternext()
    return grad
class TwoLayerNet:
    # Initialize the weights
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)
    # Forward pass
    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        return y
    # x: input data, t: target labels
    def loss(self, x, t):
        y = self.predict(x)
        return cross_entropy_error(y, t)
    # Accuracy
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy
    # x: input data, t: target labels
    def numerical_gradient(self, x, t):
        # Gradients computed by numerical differentiation
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads
    # Gradients computed by backpropagation
    def gradient(self, x, t):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}
        batch_num = x.shape[0]
        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        # backward
        dy = (y - t) / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)
        da1 = np.dot(dy, W2.T)
        dz1 = sigmoid_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)
        return grads
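A quick sanity check of the class above, using randomly generated MNIST-sized dummy arrays (x_dummy and t_dummy are illustrative placeholders, not real data):
# Dummy batch of 100 samples with 784 features and 10-dimensional targets.
net = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
x_dummy = np.random.rand(100, 784)
t_dummy = np.random.rand(100, 10)
print(net.predict(x_dummy).shape)                   # (100, 10)
print(net.gradient(x_dummy, t_dummy)['W1'].shape)   # (784, 50)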
# Load the data. normalize=True rescales pixel values from 0-255 to the 0-1 range.
# NOTE: this assumes the dataset/ package from the book's source code is importable;
# adjust the path (e.g. Deep_Learning_From_Scratch.dataset.mnist) to match your checkout.
from dataset.mnist import load_mnist
# Shape of the training features: (60000, 784)
# Shape of the training targets:  (60000, 10)
# Shape of the test features:     (10000, 784)
# Shape of the test targets:      (10000, 10)
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
# Create an instance of the two-layer network:
# W1 has shape (784, 50)
# b1 has shape (50,)
# W2 has shape (50, 10)
# b2 has shape (10,)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
# Hyperparameters
iters_num = 10000      # number of iterations: each one draws 100 of the 60000 samples
train_size = x_train.shape[0]
batch_size = 100       # mini-batch size
learning_rate = 0.1    # learning rate for the parameter updates
# History
train_loss_list = []   # loss recorded after every parameter update
train_acc_list = []    # training-set accuracy
test_acc_list = []     # test-set accuracy
# Iterations per epoch: drawing 100 random samples at a time, about 600 draws cover the 60000 samples once
iter_per_epoch = max(train_size / batch_size, 1)  # 60000 / 100 = 600
for i in range(iters_num):  # [0, 10000)
    # Draw 100 random indices out of the 60000 training samples
    batch_mask = np.random.choice(train_size, batch_size)
    # Pull out the corresponding mini-batch
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    # Compute the gradients
    grad = network.gradient(x_batch, t_batch)
    # Update the parameters
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    # Record the loss
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    # Once per epoch (every 600 iterations), measure the accuracy
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)  # training-set accuracy
        test_acc = network.accuracy(x_test, t_test)     # test-set accuracy
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))
# Plot the accuracy curves
markers = {'train': 'o', 'test': 's'}
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()
train acc, test acc | 0.09915, 0.1009
train acc, test acc | 0.7826833333333333, 0.7898
train acc, test acc | 0.8796, 0.8827
train acc, test acc | 0.8986333333333333, 0.9018
train acc, test acc | 0.90725, 0.9092
train acc, test acc | 0.91345, 0.9152
train acc, test acc | 0.9187333333333333, 0.9209
train acc, test acc | 0.9218166666666666, 0.9232
train acc, test acc | 0.9260166666666667, 0.9263
train acc, test acc | 0.9291, 0.9291
train acc, test acc | 0.9320833333333334, 0.9323
train acc, test acc | 0.9346166666666667, 0.9357
train acc, test acc | 0.9372333333333334, 0.9372
train acc, test acc | 0.9402166666666667, 0.9381
train acc, test acc | 0.9423833333333334, 0.941
train acc, test acc | 0.9443166666666667, 0.9424
train acc, test acc | 0.9458, 0.9443
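Training and test accuracy rise together and stay close throughout training (ending around 0.946 and 0.944 respectively), so the network generalizes to the test data rather than overfitting the training set.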
# How the loss changes over the course of training:
x = np.arange(len(train_loss_list))
plt.plot(x, train_loss_list, label='train_loss')
plt.xlabel("iteration")
plt.ylabel("loss")
plt.xlim(0, 10000)
plt.ylim(0, 5)
plt.show()