In the previous post we computed the gradients of a neural network's weight parameters (the gradient of the loss function with respect to the weights) using numerical differentiation. Numerical differentiation is simple to implement, but computationally expensive. Here we introduce another way to compute those gradients, the error backpropagation method, which is much faster than numerical differentiation.
~~~~~~~~~~~~~~~~~~~~~~ Derivations for the individual layers are omitted ~~~~~~~~~~~~~~~~~~~~
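For reference, the local gradients used by the layers below follow directly from the chain rule. With the upstream gradient dL/dz passed in as the `dout` argument:

    multiplication node z = x * y:  dL/dx = dout * y,   dL/dy = dout * x
    addition node       z = x + y:  dL/dx = dout,       dL/dy = dout

This is exactly what MulLayer.backward and AddLayer.backward implement.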
class MulLayer:
    def __init__(self):
        self.x = None
        self.y = None

    def forward(self, x, y):
        self.x = x
        self.y = y
        out = x * y
        return out

    def backward(self, dout):
        dx = dout * self.y
        dy = dout * self.x
        return dx, dy
【Exercise: the multiplication layer】:

apple = 100
apple_num = 2
tax = 1.1

mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# forward pass
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

# backward pass
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print("price:", int(price))
print("dApple:", dapple)
print("dApple_num:", int(dapple_num))
print("dTax:", dtax)
price: 220
dApple: 2.2
dApple_num: 110
dTax: 200
class AddLayer:
    def __init__(self):
        pass

    def forward(self, x, y):
        out = x + y
        return out

    def backward(self, dout):
        dx = dout * 1
        dy = dout * 1
        return dx, dy
【Exercise: combining the multiplication and addition layers】:
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1
# instantiate the layers
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()
# forward pass
apple_price = mul_apple_layer.forward(apple, apple_num) # (1)
orange_price = mul_orange_layer.forward(orange, orange_num) # (2)
all_price = add_apple_orange_layer.forward(apple_price, orange_price) # (3)
price = mul_tax_layer.forward(all_price, tax) # (4)
# backward pass
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice) # (4)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price) # (3)
dorange, dorange_num = mul_orange_layer.backward(dorange_price) # (2)
dapple, dapple_num = mul_apple_layer.backward(dapple_price) # (1)
print("price:", int(price))
print("dApple:", dapple)
print("dApple_num:", int(dapple_num))
print("dOrange:", dorange)
print("dOrange_num:", int(dorange_num))
print("dTax:", dtax)
price: 715
dApple: 2.2
dApple_num: 110
dOrange: 3.3000000000000003
dOrange_num: 165
dTax: 650
class Relu:
    """
    ReLU layer. In the backward pass, where x > 0 the upstream value is passed
    downstream unchanged (dx = dout); where x <= 0 the signal stops here (dout = 0).
    The input is first converted into a boolean mask array.
    """
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout * 1
        return dx
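A quick check of the mask behaviour (a minimal sketch; the array values are made up for illustration):

import numpy as np

x = np.array([[1.0, -0.5], [-2.0, 3.0]])
relu = Relu()
print(relu.forward(x))  # [[1. 0.] [0. 3.]] -- negative entries are clamped to 0
print(relu.mask)        # [[False True] [True False]]

dout = np.array([[5.0, 5.0], [5.0, 5.0]])
print(relu.backward(dout))  # [[5. 0.] [0. 5.]] -- gradient is zeroed where the input was <= 0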
import numpy as np

# standard logistic sigmoid helper (not defined in the original snippet)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = sigmoid(x)
        self.out = out
        return out

    def backward(self, dout):
        # dL/dx = dout * y * (1 - y), where y = sigmoid(x)
        dx = dout * (1.0 - self.out) * self.out
        return dx
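A minimal sanity check of the Sigmoid backward formula against a central-difference numerical derivative, using the sigmoid helper defined above (the test point 0.5 is arbitrary):

import numpy as np

x = np.array([0.5])
layer = Sigmoid()
layer.forward(x)
analytic = layer.backward(np.array([1.0]))

h = 1e-4
numeric = (sigmoid(x + h) - sigmoid(x - h)) / (2 * h)
print(analytic, numeric)  # the two values should agree to several decimal places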
class Affine:
    def __init__(self, w, b):
        self.w = w
        self.b = b
        self.x = None
        self.dw = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = np.dot(x, self.w) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.w.T)
        self.dw = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx
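A shape check on a small batch (the sizes 2, 3 and 4 below are arbitrary):

import numpy as np

batch, in_dim, out_dim = 2, 3, 4
w = np.random.randn(in_dim, out_dim)
b = np.zeros(out_dim)
affine = Affine(w, b)

x = np.random.randn(batch, in_dim)
out = affine.forward(x)
dx = affine.backward(np.ones_like(out))

print(out.shape, dx.shape)               # (2, 4) (2, 3)
print(affine.dw.shape, affine.db.shape)  # (3, 4) (4,)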
import numpy as np

def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T

    x = x - np.max(x)  # subtract the max to prevent overflow
    return np.exp(x) / np.sum(np.exp(x))
# cross-entropy error
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # if the teacher data is one-hot encoded, convert it to label indices
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
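A small usage check with made-up probabilities, showing that one-hot and label-index targets give the same loss:

import numpy as np

y = np.array([[0.1, 0.8, 0.1],
              [0.3, 0.2, 0.5]])
t_onehot = np.array([[0, 1, 0],
                     [0, 0, 1]])
t_labels = np.array([1, 2])

print(cross_entropy_error(y, t_onehot))  # ~0.458
print(cross_entropy_error(y, t_labels))  # same value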
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None  # loss
        self.y = None     # output of softmax
        self.t = None     # teacher data (one-hot vector)

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx
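A quick check on a two-sample batch (the scores are arbitrary); the backward pass returns (y - t) / batch_size:

import numpy as np

scores = np.array([[2.0, 1.0, 0.1],
                   [0.5, 2.5, 0.3]])
t = np.array([[1, 0, 0],
              [0, 1, 0]])

last_layer = SoftmaxWithLoss()
loss = last_layer.forward(scores, t)
dx = last_layer.backward()
print(loss)      # scalar cross-entropy loss
print(dx.shape)  # (2, 3)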
from collections import OrderedDict
from Deep_Learning_From_Scratch.dataset.mnist import load_mnist
import time
import numpy as np
# ---------------------------------Affine layer--------------------------------------------------
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = np.dot(x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx
# ---------------------------------------Relu layer----------------------------------------------
class Relu:
    """
    ReLU layer. In the backward pass, where x > 0 the upstream value is passed
    downstream unchanged (dx = dout); where x <= 0 the signal stops here (dout = 0).
    """
    def __init__(self):
        self.mask = None  # mask: a numpy array of True/False

    def forward(self, x):
        self.mask = (x <= 0)  # True where x <= 0, False everywhere else
        out = x.copy()        # where the mask is False, the output equals x
        out[self.mask] = 0    # where the mask is True, the output is 0
        return out

    def backward(self, dout):
        # where the mask is True the backward signal stops here (set to 0);
        # where it is False the upstream value is passed downstream unchanged
        dout[self.mask] = 0
        dx = dout
        return dx
# ---------------------------activation function definition---------------------------------------
def softmax(x):
    if x.ndim == 2:  # batched 2-D input
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T

    x = x - np.max(x)  # subtract the max to prevent overflow
    return np.exp(x) / np.sum(np.exp(x))
# ---------------------------loss function definition (cross-entropy error)------------------------
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # if the teacher data is one-hot encoded, convert it to label indices
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
# -------------------------------SoftmaxWithLoss layer--------------------------------------------
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None  # loss
        self.y = None     # output of softmax
        self.t = None     # teacher data (one-hot)

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:  # teacher data is one-hot encoded
            dx = (self.y - self.t) / batch_size
        else:                           # teacher data is given as label indices
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx
### ================================================================================
# # Numerical differentiation (kept for comparing the gradients computed by
# # numerical differentiation and by error backpropagation)
# def numerical_gradient_no_batch(f, x):
#     h = 1e-04
#     grad = np.zeros_like(x)
#
#     for index in range(x.size):
#         tmp_value = x[index]  # save the original value
#
#         x[index] = tmp_value + h  # f(x+h)
#         fxh1 = f(x)
#         x[index] = tmp_value - h  # f(x-h)
#         fxh2 = f(x)
#         grad[index] = (fxh1 - fxh2) / (2 * h)
#         x[index] = tmp_value  # restore the original value
#
#     return grad
#
#
# def numerical_gradient(f, X):
#     if X.ndim == 1:
#         return numerical_gradient_no_batch(f, X)
#     else:
#         grad = np.zeros_like(X)
#
#         for idx, x in enumerate(X):
#             grad[idx] = numerical_gradient_no_batch(f, x)
#
#         return grad
### ================================================================================
# ----------------------------------two-layer neural network---------------------------------------
class TwoLayersNet:
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # initialize the weights
        self.params = {}  # parameter dictionary
        # weights and biases from the input layer to the hidden layer:
        # Gaussian random values of shape (input_size, hidden_size)
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # build the layers
        # OrderedDict remembers the order in which elements were added, so during
        # backpropagation we only need to call the layers in the reverse order
        self.layers = OrderedDict()
        # Affine1 layer
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        # Relu1 layer
        self.layers['Relu1'] = Relu()
        # Affine2 layer
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        # softmax-with-loss layer
        self.lastlayer = SoftmaxWithLoss()

    # recognition (inference)
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    # compute the value of the loss function
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastlayer.forward(y, t)

    # compute the recognition accuracy
    def accuracy(self, x, t):
        y = self.predict(x)       # inference
        y = np.argmax(y, axis=1)  # index of the largest output
        if t.ndim != 1:           # teacher data is one-hot encoded (not 1-D)
            t = np.argmax(t, axis=1)
        # a prediction is correct when the indices match
        accuracy = np.sum(y == t) / float(x.shape[0])  # x.shape[0] is the number of samples
        return accuracy

    # compute the gradients of the weight parameters
    # via error backpropagation
    def gradient(self, x, t):
        # forward
        self.loss(x, t)

        # backward
        dout = 1
        dout = self.lastlayer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()  # reverse order
        for layer in layers:
            dout = layer.backward(dout)

        # collect the gradients
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        return grads  # gradients for each parameter

    # # compute the gradients of the weight parameters (x: input data, t: teacher data)
    # # via numerical differentiation (kept for comparison)
    # def numerical_gradient(self, x, t):
    #     loss_W = lambda W: self.loss(x, t)
    #     grads = {}
    #     grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
    #     grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
    #     grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
    #     grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
    #
    #     return grads  # gradients for each parameter
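A gradient check is the usual sanity test here: compare the backprop gradients against numerical gradients on a tiny batch. The sketch below assumes the two commented-out numerical-gradient blocks above are uncommented, and uses a 3-sample batch because the numerical version is slow:

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayersNet(input_size=784, hidden_size=50, output_size=10)
x_small = x_train[:3]
t_small = t_train[:3]

grad_backprop = network.gradient(x_small, t_small)
grad_numerical = network.numerical_gradient(x_small, t_small)  # slow, but only run once

for key in grad_backprop.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":", diff)  # each difference should be very small (roughly 1e-10 to 1e-7)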
# load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# start timing
start = time.time()

# instantiate the network
network = TwoLayersNet(input_size=784, hidden_size=50, output_size=10)

# hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):  # 10000 iterations
    # randomly pick 100 samples
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # gradients via backpropagation
    grad = network.gradient(x_batch, t_batch)
    # grad = network.numerical_gradient(x_batch, t_batch)

    # update the parameters
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # record the loss
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # once per epoch (every train_size / batch_size iterations),
    # compute the accuracy on the training set and on the test set
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))

# stop timing
end = time.time()
print('Training time with backpropagation:', end - start)
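To visualise the result, the recorded accuracies can be plotted with matplotlib (a minimal sketch; matplotlib is not imported in the original script):

import matplotlib.pyplot as plt

epochs = np.arange(len(train_acc_list))
plt.plot(epochs, train_acc_list, label='train acc')
plt.plot(epochs, test_acc_list, label='test acc', linestyle='--')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()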