[Note]: There is no "human intervention" in the highlighted (yellow) region! The strength of a neural network is that every problem can be attacked with the same workflow: by repeatedly learning from the data it is given, the network tries to discover the pattern of the problem at hand, learning "end to end" directly from the raw data.
# Mean squared error
import numpy as np
def mean_squared_error(y, t):
    return 0.5 * np.sum((y - t)**2)
# Simulated data:
# t is the one-hot target for the handwritten digit 2
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
# y is the predicted probability distribution, as output by the softmax function
y1 = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
y2 = [0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]
result1 = mean_squared_error(np.array(y1), np.array(t))
print(np.argmax(y1))
print(result1)
result2 = mean_squared_error(np.array(y2), np.array(t))
print(np.argmax(y2))
print(result2)
2
0.09750000000000003
7
0.5975
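Both results can be checked by hand: y1 puts its largest probability (0.6) on the correct class 2, and its error is 0.5 * (0.1^2 + 0.05^2 + 0.4^2 + 0.05^2 + 0.1^2 + 0.1^2) = 0.0975, while y2 puts its largest probability on class 7 and its error grows to 0.5975. The mean squared error is therefore smaller for the prediction that agrees better with the target.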
import numpy as np
import matplotlib.pylab as plt
# Plot the cross-entropy error curve y = -log(x)
x = np.arange(1e-7, 1., 0.001)
y = - np.log(x)
plt.plot(x, y)
plt.ylim(0., 5.3)
plt.show()
# Define the cross-entropy error
def cross_entropy_error(y, t):
    delta = 1e-7  # small constant to avoid log(0)
    return -np.sum(t * np.log(y + delta))
# Simulated data:
# t is the one-hot target for the handwritten digit 2
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
# y is the predicted probability distribution, as output by the softmax function
y1 = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
y2 = [0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]
result1 = cross_entropy_error(np.array(y1), np.array(t))
print(result1)
result2 = cross_entropy_error(np.array(y2), np.array(t))
print(result2)
0.510825457099338
2.302584092994546
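With a one-hot target, the sum collapses to minus the log of the probability assigned to the correct class: -log(0.6) ≈ 0.51 for y1 and -log(0.1) ≈ 2.30 for y2 (natural logarithm). As with the mean squared error, the prediction that favours the correct class 2 gets the smaller error.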
Optimization in deep learning essentially comes down to gradient descent, and there are two basic ways to schedule the parameter updates. The first is to run through the entire data set, compute the loss, compute the gradient of the loss with respect to every parameter, and then update. Every single update has to visit every sample in the data set, so it is computationally expensive, slow, and incompatible with online learning; this is batch gradient descent.
The other extreme is to compute the loss, the gradient, and a parameter update after every individual sample; this is stochastic gradient descent (SGD). It is much faster, but its convergence behaviour is worse: it tends to wander around near the optimum without ever hitting it, and successive updates can partly cancel each other out, so the objective oscillates noticeably.
To avoid the drawbacks of both, the usual compromise is mini-batch gradient descent: the data are split into small batches and the parameters are updated once per batch. The samples within a batch jointly determine the direction of the step, so the descent is less noisy and less likely to go astray; at the same time a batch is far smaller than the whole data set, so each update remains cheap. A minimal sketch of the three schemes follows.
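The sketch below illustrates the three update schemes on a toy one-parameter least-squares problem. The toy data, the grad_w helper, the learning rate 0.1 and the batch size 32 are illustrative choices for this sketch only, not values used elsewhere in this chapter.
import numpy as np
# Toy data: fit y = w * x by least squares; w is the single parameter being learned.
np.random.seed(0)
x_data = np.random.rand(1000)
t_data = 3.0 * x_data + 0.1 * np.random.randn(1000)
def grad_w(w, x, t):
    # gradient of the loss 0.5 * mean((w*x - t)**2) with respect to w
    return np.mean((w * x - t) * x)
# 1) Batch gradient descent: every update uses all samples.
w = 0.0
for i in range(100):
    w -= 0.1 * grad_w(w, x_data, t_data)
print("batch GD:", w)
# 2) Stochastic gradient descent: one update per single sample.
w = 0.0
for i in np.random.permutation(len(x_data)):
    w -= 0.1 * grad_w(w, x_data[i:i+1], t_data[i:i+1])
print("SGD:", w)
# 3) Mini-batch gradient descent: one update per small random batch.
w = 0.0
for i in range(100):
    idx = np.random.choice(len(x_data), 32)
    w -= 0.1 * grad_w(w, x_data[idx], t_data[idx])
print("mini-batch GD:", w)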
The mini-batch version below uses the cross-entropy error as the example:
import numpy as np
def cross_entropy_error(y, t):
    # Promote a single sample to a batch of size 1
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    # Average the cross-entropy error over the batch
    return -np.sum(t * np.log(y + 1e-7)) / batch_size
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
y1 = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
result = cross_entropy_error(np.array(y1), np.array(t))
print(result)
0.510825457099338
t = [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]]
y2 = [[0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0],
[0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0],
[0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0],
[0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]]
result = cross_entropy_error(np.array(y2), np.array(t))
print(result)
1.406704775046942
# If the targets are no longer one-hot vectors but class indices
t = [2, 2, 2, 2]
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    # For each row, pick the probability assigned to the correct class
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
result = cross_entropy_error(np.array(y2), np.array(t))
print(result)
1.406704775046942
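Here t holds the class indices directly, and the fancy-indexing expression y[np.arange(batch_size), t] picks out, for every row of y, the probability assigned to the correct class (in this example y2[0, 2] = 0.6, y2[1, 2] = 0.6, y2[2, 2] = 0.1 and y2[3, 2] = 0.1), so the result is identical to the one-hot version above.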
[Note]: The derivative-related material below is there to help find the minimum of the loss function!
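The numerical_diff function below approximates the derivative with the central difference (f(x+h) - f(x-h)) / (2h) for a small h (here 1e-4); centering the difference around x gives a smaller error than the one-sided difference (f(x+h) - f(x)) / h.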
import numpy as np
import matplotlib.pylab as plt
# Numerical differentiation using the central difference
def numerical_diff(f, x):
    h = 1e-4  # 0.0001
    return (f(x+h) - f(x-h)) / (2*h)
# The function f1
def f1(x):
    return x**2
# Derivative of f1 at x = 5
print(numerical_diff(f1, 5))
9.999999999976694
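For comparison, the analytic derivative of f1(x) = x**2 at x = 5 is 2 * 5 = 10, so the numerical estimate above is essentially exact.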
def numerical_diff(f, x):
    h = 1e-4  # 0.0001
    return (f(x+h) - f(x-h)) / (2*h)
# f2(x0, x1) = x0^2 + x1^2
def f2(x):
    return x[0]**2 + x[1]**2
# Partial derivative of f2 with respect to x1 at the point (3, 4):
# fix x0 = 3 and differentiate the resulting one-variable function
def f2_temp(x):
    return 3.0**2.0 + x*x
print(numerical_diff(f2_temp, 4.0))
7.999999999999119
import numpy as np
# Gradient (single point, no batch)
def _numerical_gradient_no_batch(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)
    for idx in range(x.size):
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)
        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2 * h)
        # restore the original value
        x[idx] = tmp_val
    return grad
def f2(x):
    return x[0]**2 + x[1]**2
print(_numerical_gradient_no_batch(f2, np.array([3.0, 4.0])))
[6. 8.]
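This matches the analytic gradient of f2, (2*x0, 2*x1), which evaluates to (6, 8) at the point (3.0, 4.0).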
import numpy as np
import matplotlib.pylab as plt
def _numerical_gradient_no_batch(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)
    for idx in range(x.size):
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)
        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2*h)
        x[idx] = tmp_val  # restore the original value
    return grad
def numerical_gradient(f, X):
    # Compute the gradient for a single point or for a batch of points
    if X.ndim == 1:
        return _numerical_gradient_no_batch(f, X)
    else:
        grad = np.zeros_like(X)
        for idx, x in enumerate(X):
            grad[idx] = _numerical_gradient_no_batch(f, x)
        return grad
def function_2(x):
    if x.ndim == 1:
        return np.sum(x**2)
    else:
        return np.sum(x**2, axis=1)
def tangent_line(f, x):
    # Tangent line at x (kept from the original source; not used in the plot below)
    d = numerical_gradient(f, x)
    print(d)
    y = f(x) - d*x
    return lambda t: d*t + y
if __name__ == '__main__':
    x0 = np.arange(-2, 2.5, 0.25)
    x1 = np.arange(-2, 2.5, 0.25)
    X, Y = np.meshgrid(x0, x1)
    X = X.flatten()
    Y = Y.flatten()
    # Negative gradient at each grid point, drawn as an arrow field
    grad = numerical_gradient(function_2, np.array([X, Y]))
    plt.figure()
    plt.quiver(X, Y, -grad[0], -grad[1], angles="xy", color="#666666")
    plt.xlim([-2, 2])
    plt.ylim([-2, 2])
    plt.xlabel('x0')
    plt.ylabel('x1')
    plt.grid()
    plt.legend()
    plt.draw()
    plt.show()
import numpy as np
from Deep_Learning_From_Scratch.ch04.gradient_2d import numerical_gradient
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    """
    :param f: the function to minimize
    :param init_x: the starting point
    :param lr: the learning rate
    :param step_num: how many gradient steps to take
    :return: x is the point reached at the end; np.array(x_history) is the trajectory
    """
    x = init_x
    x_history = []
    for i in range(step_num):
        x_history.append(x.copy())
        grad = numerical_gradient(f, x)
        x -= lr * grad
    return x, np.array(x_history)
def f2(x):
    return x[0]**2 + x[1]**2
x_init = np.array([-3.0, 4.0])
destination, history = gradient_descent(f2, x_init)
print(destination)
# With a very large learning rate, the iterates diverge to huge values
x_init = np.array([-3.0, 4.0])
destination, history = gradient_descent(f2, x_init, lr=10)
print(destination)
# With a very small learning rate, the point has barely moved by the time the iterations end
x_init = np.array([-3.0, 4.0])
destination, history = gradient_descent(f2, x_init, lr=1e-15)
print(destination)
[-0.39785867 0.53047822]
[-2.58983747e+13 -1.29524862e+12]
[-3. 4.]
[Note]: The learning rate lr is a hyperparameter, set by hand; that makes it different in nature from the network's parameters (weights and biases), which the learning algorithm finds automatically. In practice one usually tries several candidate values and keeps the one that works best, as sketched below.
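A minimal sketch of such a comparison, reusing the gradient_descent and f2 defined above; the candidate learning-rate values are illustrative.
# Compare several candidate learning rates by the loss value reached after 100 steps.
for lr in (10.0, 1.0, 0.1, 0.01, 1e-5):
    destination, _ = gradient_descent(f2, np.array([-3.0, 4.0]), lr=lr, step_num=100)
    print("lr =", lr, " final f2 =", f2(destination))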
import numpy as np
def softmax(x):
    if x.ndim == 2:
        # Subtract the per-sample maximum for numerical stability, then normalize
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T
    x = x - np.max(x)  # guard against overflow
    return np.exp(x) / np.sum(np.exp(x))
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def sigmoid_grad(x):
    return (1.0 - sigmoid(x)) * sigmoid(x)
# Cross-entropy error
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    # If the targets are one-hot vectors, convert them to class-index labels
    if t.size == y.size:
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
def numerical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)
        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val  # restore the original value
        it.iternext()
    return grad
class TwoLayerNet:
    # Initialize the weights
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)
    # Forward pass
    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        return y
    # x: input data, t: target labels
    def loss(self, x, t):
        y = self.predict(x)
        return cross_entropy_error(y, t)
    # Accuracy
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy
    # x: input data, t: target labels
    def numerical_gradient(self, x, t):
        # Gradients computed by numerical differentiation
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads
    # Gradients computed by backpropagation
    def gradient(self, x, t):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}
        batch_num = x.shape[0]
        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        # backward
        dy = (y - t) / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)
        da1 = np.dot(dy, W2.T)
        dz1 = sigmoid_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)
        return grads
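A quick sanity check of the class above, using randomly generated MNIST-sized dummy arrays (x_dummy and t_dummy are illustrative placeholders, not real data):
# Dummy batch of 100 samples with 784 features and 10-dimensional targets.
net = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
x_dummy = np.random.rand(100, 784)
t_dummy = np.random.rand(100, 10)
print(net.predict(x_dummy).shape)                   # (100, 10)
print(net.gradient(x_dummy, t_dummy)['W1'].shape)   # (784, 50)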
# Load the data. normalize=True rescales pixel values from 0-255 to the 0-1 range.
# NOTE: this assumes the dataset/ package from the book's source code is importable;
# adjust the path (e.g. Deep_Learning_From_Scratch.dataset.mnist) to match your checkout.
from dataset.mnist import load_mnist
# Shape of the training features: (60000, 784)
# Shape of the training targets:  (60000, 10)
# Shape of the test features:     (10000, 784)
# Shape of the test targets:      (10000, 10)
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
# Create an instance of the two-layer network:
# W1 has shape (784, 50)
# b1 has shape (50,)
# W2 has shape (50, 10)
# b2 has shape (10,)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
# Hyperparameters
iters_num = 10000      # number of iterations: each one draws 100 of the 60000 samples
train_size = x_train.shape[0]
batch_size = 100       # mini-batch size
learning_rate = 0.1    # learning rate for the parameter updates
# History
train_loss_list = []   # loss recorded after every parameter update
train_acc_list = []    # training-set accuracy
test_acc_list = []     # test-set accuracy
# Iterations per epoch: drawing 100 random samples at a time, about 600 draws cover the 60000 samples once
iter_per_epoch = max(train_size / batch_size, 1)  # 60000 / 100 = 600
for i in range(iters_num):  # [0, 10000)
    # Draw 100 random indices out of the 60000 training samples
    batch_mask = np.random.choice(train_size, batch_size)
    # Pull out the corresponding mini-batch
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    # Compute the gradients
    grad = network.gradient(x_batch, t_batch)
    # Update the parameters
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    # Record the loss
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    # Once per epoch (every 600 iterations), measure the accuracy
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)  # training-set accuracy
        test_acc = network.accuracy(x_test, t_test)     # test-set accuracy
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))
# Plot the accuracy curves
markers = {'train': 'o', 'test': 's'}
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()
train acc, test acc | 0.09915, 0.1009
train acc, test acc | 0.7826833333333333, 0.7898
train acc, test acc | 0.8796, 0.8827
train acc, test acc | 0.8986333333333333, 0.9018
train acc, test acc | 0.90725, 0.9092
train acc, test acc | 0.91345, 0.9152
train acc, test acc | 0.9187333333333333, 0.9209
train acc, test acc | 0.9218166666666666, 0.9232
train acc, test acc | 0.9260166666666667, 0.9263
train acc, test acc | 0.9291, 0.9291
train acc, test acc | 0.9320833333333334, 0.9323
train acc, test acc | 0.9346166666666667, 0.9357
train acc, test acc | 0.9372333333333334, 0.9372
train acc, test acc | 0.9402166666666667, 0.9381
train acc, test acc | 0.9423833333333334, 0.941
train acc, test acc | 0.9443166666666667, 0.9424
train acc, test acc | 0.9458, 0.9443
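Training and test accuracy rise together and stay close throughout training (ending around 0.946 and 0.944 respectively), so the network generalizes to the test data rather than overfitting the training set.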
# How the loss changes over the course of training:
x = np.arange(len(train_loss_list))
plt.plot(x, train_loss_list, label='train_loss')
plt.xlabel("iteration")
plt.ylabel("loss")
plt.xlim(0, 10000)
plt.ylim(0, 5)
plt.show()