The Evolution of Convolutional Neural Networks (2): Going Deeper

VGGNet achieved excellent results in the 2014 ImageNet competition, demonstrating that network depth is a key factor in performance. To allow networks to be built even deeper, ResNet (2015) introduced the idea of shortcut connections and proposed a residual learning structure.

VGGNet

VGGNet achieved excellent results in the 2014 ImageNet competition and established that both increasing depth and using small convolution kernels contribute substantially to final classification accuracy.
VGGNet comes in several variants, with depths ranging from 11 to 19 layers; VGGNet-16 and VGGNet-19 are the most commonly used.
The VGG-16 architecture is very regular and has few hyperparameters to tune: it is built from simple repeated units, each a few convolutional layers followed by a pooling layer that shrinks the spatial size. Concretely, it uses 3x3 convolution kernels and 2x2 max pooling throughout.
Because of its depth and the number of fully connected nodes, the VGG16 weights exceed 533MB and VGG19 exceeds 574MB, which makes VGG cumbersome to deploy. The VGG architecture is still used in many deep learning image classification problems, but smaller architectures (such as SqueezeNet, GoogLeNet, and others) are often preferable.
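
The advantage of small kernels can be checked directly: two stacked 3x3 convolutions cover the same 5x5 receptive field as a single 5x5 convolution, but with fewer parameters and an extra nonlinearity between them. A minimal sketch (the channel count and the names n_params, stacked, single are illustrative, not taken from VGG):

from torch import nn

def n_params(m):
    return sum(p.numel() for p in m.parameters())

c = 64  # illustrative channel count
stacked = nn.Sequential(                      # two 3x3 convs: 5x5 receptive field
    nn.Conv2d(c, c, 3, padding=1), nn.ReLU(),
    nn.Conv2d(c, c, 3, padding=1),
)
single = nn.Conv2d(c, c, 5, padding=2)        # one 5x5 conv: same receptive field

print(n_params(stacked), n_params(single))    # 73856 vs 102464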

import torch
from torch import nn

class VGG16(nn.Module):
    '''
    VGG16
    INPUT -> image shape (224, 224, 3), number of classes (1000)
    '''
    def __init__(self):
        super(VGG16, self).__init__()
        self.block1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
            # After conv: (224+1*2-3)/stride+1 = 224
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False),
            # After conv: (224+1*2-3)/stride+1 = 224
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
            # After pooling: (224-2)/stride+1 = 112
        )
        self.block2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding=1, bias=False),
            # After conv: (112+1*2-3)/stride+1 = 112
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False),
            # After conv: (112+1*2-3)/stride+1 = 112
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
            # After pooling: (112-2)/stride+1 = 56
        )
        self.block3 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, padding=1, bias=False),
            # After conv: (56+1*2-3)/stride+1 = 56
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False),
            # After conv: (56+1*2-3)/stride+1 = 56
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False),
            # After conv: (56+1*2-3)/stride+1 = 56
            # (the standard VGG-16, configuration D in the paper, uses a 3x3 conv here, not 1x1)
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
            # After pooling: (56-2)/stride+1 = 28
        )
        self.block4 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, padding=1, bias=False),
            # After conv: (28+1*2-3)/stride+1 = 28
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=False),
            # After conv: (28+1*2-3)/stride+1 = 28
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=False),
            # After conv: (28+1*2-3)/stride+1 = 28
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
            # After pooling: (28-2)/stride+1 = 14
        )
        self.block5 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=False),
            # After conv: (14+1*2-3)/stride+1 = 14
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=False),
            # After conv: (14+1*2-3)/stride+1 = 14
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=False),
            # After conv: (14+1*2-3)/stride+1 = 14
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
            # After pooling: (14-2)/stride+1 = 7
        )

        self.classifier = nn.Sequential(
            nn.Linear(512*7*7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 1000)
        )

    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)
        x = self.block5(x)
        x = x.view(x.size(0), 512*7*7)
        out = self.classifier(x)
        return out

model = VGG16()
print(model)

input_tensor = torch.randn((1, 3, 224, 224))
out = model(input_tensor)  # Variable is deprecated in modern PyTorch; plain tensors work directly
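
A quick way to check the size claim above is to count parameters and convert to megabytes at 4 bytes per float32 weight; this is a rough sanity check, not an exact on-disk file size:

num_params = sum(p.numel() for p in model.parameters())
print('params: %.1fM, approx. size: %.0fMB' % (num_params / 1e6, num_params * 4 / 1024**2))
# roughly 138M parameters, i.e. around 528MB of raw float32 weights,
# on the order of the 533MB weight file quoted above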

ResNet

Network depth has a large effect on classification and recognition performance, so the natural impulse is to make networks as deep as possible. The main benefit of a very deep network is that it can represent very complex functions and learn features at many levels of abstraction, from edges (in the lower layers) to very complex patterns (in the deeper layers). However, deeper is not always better: one reason is that the deeper the network, the more pronounced the vanishing gradient problem becomes, and training suffers accordingly.
Before ResNet, techniques such as batch normalization and ReLU alleviated this problem, but networks still could not be made deep enough. In 2015, the ImageNet winner, a new architecture called ResNet, broke through this barrier. Its core component is a residual learning structure (Residual Block) based on identity mappings: "shortcut connections" between earlier and later layers allow much deeper networks to be trained.

Left: the standard residual block; right: the bottleneck variant optimized for networks of 50 layers or more.
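
Before the full implementation, a minimal sketch of the idea (the class TinyResidual and its layer sizes are illustrative, not from the paper): the block computes a residual F(x) and returns F(x) + x, so even if F learns almost nothing the block can fall back to the identity, and gradients flow through the shortcut unchanged.

import torch
from torch import nn

class TinyResidual(nn.Module):
    '''Minimal residual block: output = ReLU(F(x) + x), with F two 3x3 convs.'''
    def __init__(self, channels):
        super(TinyResidual, self).__init__()
        self.body = nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(channels)
        )

    def forward(self, x):
        return torch.relu(self.body(x) + x)  # the "+ x" is the shortcut connection

x = torch.randn(1, 64, 56, 56)
print(TinyResidual(64)(x).shape)  # torch.Size([1, 64, 56, 56])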

import torch
import torch.nn as nn

#========================================================
# Basic block (ResNet-18/34)
#========================================================

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, input_planes, filters, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(input_planes, filters, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(filters),
            nn.ReLU(inplace=True)
        )
        # Spatial size halves when stride=2, otherwise unchanged; channels input_planes -> filters
        self.conv2 = nn.Sequential(
            nn.Conv2d(filters, filters, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(filters)
        )
        # Spatial size unchanged; channels stay at filters
        self.downsample = downsample
        # Shortcut projection: adjusts channels (by the expansion factor) and spatial size (halved or not)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        # Project the shortcut if the shapes differ
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

#========================================================
# Bottleneck block (ResNet-50/101/152)
#========================================================

class BottleneckBlock(nn.Module):
    expansion = 4

    def __init__(self, input_planes, filters, stride=1, downsample=None):
        super(BottleneckBlock, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(input_planes, filters, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(filters),
            nn.ReLU(inplace=True)
            # The ReLU here (and after conv2) is required; without it the three convs
            # would collapse into a single linear map
        )
        # 1x1 conv: spatial size unchanged; channels input_planes -> filters
        self.conv2 = nn.Sequential(
            nn.Conv2d(filters, filters, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(filters),
            nn.ReLU(inplace=True)
        )
        # 3x3 conv: spatial size halves when stride=2; channels stay at filters
        self.conv3 = nn.Sequential(
            nn.Conv2d(filters, filters*4, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(filters*4)
        )
        # 1x1 conv: spatial size unchanged; channels filters -> filters*4
        self.downsample = downsample
        # Shortcut projection: adjusts channels (by the expansion factor) and spatial size (halved or not)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out)
        # Project the shortcut if the shapes differ
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

#========================================================
# Network skeleton
#========================================================

class ResNet(nn.Module):
    '''
    ResNet
    INPUT -> image shape (224, 224, 3), number of classes (1000)
    '''
    def __init__(self, block, block_config):
        super(ResNet, self).__init__()
        self.inplanes = 64
        self.part0 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            # After conv: (224+3*2-7)/2+1 = 112
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
            # After pooling: (112+1*2-3)/2+1 = 56
            # (the original listing omitted this max pool, which the paper's architecture includes)
        )
        self.part1 = self.__combine__(block, 64, block_config[0])
        # Spatial size unchanged (56); channels 64 * block.expansion
        self.part2 = self.__combine__(block, 128, block_config[1], stride=2)
        # Spatial size halved (28); channels 128 * block.expansion
        self.part3 = self.__combine__(block, 256, block_config[2], stride=2)
        # Spatial size halved (14); channels 256 * block.expansion
        self.part4 = self.__combine__(block, 512, block_config[3], stride=2)
        # Spatial size halved (7); channels 512 * block.expansion
        self.avgpool = nn.AvgPool2d(kernel_size=7)
        # After pooling: (7-7)/stride+1 = 1
        self.classifier = nn.Linear(512 * block.expansion, 1000)

    def __combine__(self, block, planes, nums, stride=1):
        layers = []
        self.expansion = block.expansion
        # Shortcut projection: adjusts channels (by the block's expansion factor)
        # and, when stride=2, halves the spatial size
        downsample = None
        if stride != 1 or self.inplanes != planes*self.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes*self.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes*self.expansion)
            )
        layers.append(block(self.inplanes, planes, stride, downsample))
        # Chain the remaining residual blocks of this stage
        self.inplanes = planes*self.expansion  # output channels * the block's expansion factor
        for i in range(1, nums):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.part0(x)
        x = self.part1(x)
        x = self.part2(x)
        x = self.part3(x)
        x = self.part4(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), 512*self.expansion)
        x = self.classifier(x)
        return x

#========================================================
# Classic ResNet variants
#========================================================

def resnet18(**kwargs):
    return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)

def resnet34(**kwargs):
    return ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)

def resnet50(**kwargs):
    return ResNet(BottleneckBlock, [3, 4, 6, 3], **kwargs)

def resnet101(**kwargs):
    return ResNet(BottleneckBlock, [3, 4, 23, 3], **kwargs)

def resnet152(**kwargs):
    return ResNet(BottleneckBlock, [3, 8, 36, 3], **kwargs)

#========================================================
# Main
#========================================================

model = resnet34()
print(model)

input_tensor = torch.randn((1, 3, 224, 224))
out = model(input_tensor)  # Variable is deprecated in modern PyTorch; plain tensors work directly

DenseNet

The deeper a network, the more capacity it has, and the more likely it is to over-learn the training set and overfit. AlexNet, the first deep network, introduced the important dropout strategy, which is very effective at improving generalization; AlexNet can still be used for many tasks today, which speaks to its robustness. Later, regularization strategies such as batch normalization and data augmentation replaced dropout and also alleviated overfitting to some extent. "Deep Networks with Stochastic Depth" improved ResNet's generalization by randomly dropping entire layers during training. This reveals an important property: networks carry substantial redundancy, and one can estimate how much by exploring how the dropout rate affects performance.
If dropping some inter-layer connections, or even whole layers, does not hurt performance, then the nonlinear transformation those layers learn must be small. And if the transformation is small, does each layer really need to learn hundreds of channels? Those hundreds of channels are exactly where the computational cost lives.
With this in mind, DenseNet does two things at once: first, it connects every layer directly to all layers before it, improving feature reuse; second, it makes every layer very narrow, i.e., each convolution outputs only a few dozen channels, so each layer learns a small number of feature maps and concatenates them with its input. This maximizes resource utilization and compresses the computation.
At the same accuracy on the ImageNet classification dataset, DenseNet needs less than half the parameters of ResNet and roughly half the computation.
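
The channel bookkeeping is the heart of DenseNet: each layer emits only growth_rate new channels and concatenates them onto everything that came before. A minimal sketch of that growth (the channel counts and loop structure are illustrative, not the full block below):

import torch
from torch import nn

growth_rate = 32
x = torch.randn(1, 64, 56, 56)             # input with 64 channels
for i in range(4):                         # four dense layers
    conv = nn.Conv2d(x.size(1), growth_rate, 3, padding=1, bias=False)
    x = torch.cat([x, conv(x)], dim=1)     # channels: 64 -> 96 -> 128 -> 160 -> 192
print(x.shape)                             # torch.Size([1, 192, 56, 56])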

import torch
import torch.nn as nn
import torch.nn.functional as F

#========================================================
# Dense layer
#========================================================

class DenseLayer(nn.Module):
    def __init__(self, num_input_features, growth_rate):
        super(DenseLayer, self).__init__()
        self.features = nn.Sequential(
            nn.BatchNorm2d(num_input_features),
            nn.ReLU(inplace=True),
            nn.Conv2d(num_input_features, 4*growth_rate, kernel_size=1, stride=1, bias=False),
            # 1x1 bottleneck conv: spatial size unchanged, channels -> 4*growth_rate
            nn.BatchNorm2d(4*growth_rate),
            nn.ReLU(inplace=True),
            nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False)
            # 3x3 conv: spatial size unchanged, channels -> growth_rate
        )
        self.drop_rate = 0  # optional dropout on the newly produced features

    def forward(self, x):
        new_features = self.features(x)
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
        # Concatenate the new feature maps onto everything this layer received
        return torch.cat([x, new_features], 1)

class DenseBlock(nn.Sequential):
    def __init__(self, num_layers, num_input_features, growth_rate):
        super(DenseBlock, self).__init__()
        for i in range(num_layers):
            # Each layer sees the channels of all previous layers in the block
            layer = DenseLayer(num_input_features + i * growth_rate, growth_rate)
            self.add_module('denselayer%d' % (i + 1), layer)

#========================================================
# Transition between dense blocks
#========================================================

class Transition(nn.Sequential):
    def __init__(self, num_input_features, num_output_features):
        super(Transition, self).__init__()
        self.add_module('norm', nn.BatchNorm2d(num_input_features))
        self.add_module('relu', nn.ReLU(inplace=True))
        self.add_module('conv', nn.Conv2d(num_input_features, num_output_features, kernel_size=1, stride=1, bias=False))
        # 1x1 conv: spatial size unchanged, channel count changes
        self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))
        # Pooling halves the spatial size

#========================================================
# Network skeleton
#========================================================

class DenseNet(nn.Module):
    '''
    DenseNet
    INPUT -> image shape (224, 224, 3), number of classes (1000)
    '''
    def __init__(self, num_init_features=64, block_config=(6, 12, 24, 16), growth_rate=32):
        super(DenseNet, self).__init__()
        self.part0 = nn.Sequential(
            nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False),
            # After conv: (224+3*2-7)/2+1 = 112
            nn.BatchNorm2d(num_init_features),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
            # After pooling: (112+1*2-3)/2+1 = 56
        )

        # Assemble the densely connected blocks
        self.part1_4, num_features = self.__combine__(num_init_features=num_init_features, growth_rate=growth_rate, block_config=block_config)

        self.avgpool = nn.Sequential(
            nn.BatchNorm2d(num_features),
            nn.ReLU(inplace=True),
            nn.AvgPool2d(kernel_size=7, stride=1)
            # After pooling: (7-7)/stride+1 = 1
        )
        self.num_features = num_features
        self.classifier = nn.Linear(num_features, 1000)

    def __combine__(self, num_init_features, block_config, growth_rate):
        layers = []
        num_features = num_init_features
        for i, num_layers in enumerate(block_config):
            block = DenseBlock(num_layers=num_layers, num_input_features=num_features, growth_rate=growth_rate)
            layers.append(block)
            num_features = num_features + num_layers * growth_rate
            if i != len(block_config) - 1:
                trans = Transition(num_input_features=num_features, num_output_features=num_features // 2)
                # Transitions halve both the spatial size and the channel count
                layers.append(trans)
                num_features = num_features // 2
        return nn.Sequential(*layers), num_features

    def forward(self, x):
        x = self.part0(x)
        x = self.part1_4(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), self.num_features)
        # Flatten using the computed channel count; a hard-coded 1024 would only be correct for densenet121
        x = self.classifier(x)
        return x

#========================================================
# Classic DenseNet variants
#========================================================

def densenet121(**kwargs):
    return DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16), **kwargs)

def densenet169(**kwargs):
    return DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32), **kwargs)

def densenet201(**kwargs):
    return DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32), **kwargs)

#========================================================
# Main
#========================================================

model = densenet121()
print(model)

input_tensor = torch.randn((1, 3, 224, 224))
out = model(input_tensor)  # Variable is deprecated in modern PyTorch; plain tensors work directly
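
As a rough check of the parameter claim above, the two implementations can be compared directly, assuming the resnet50() factory from the previous listing and densenet121() are both in the same session (exact counts depend on implementation details):

def count_params(m):
    return sum(p.numel() for p in m.parameters()) / 1e6

print('resnet50: %.1fM' % count_params(resnet50()))        # roughly 25M parameters
print('densenet121: %.1fM' % count_params(densenet121()))  # roughly 8M parameters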

Further Reading
