Preface
I have studied some object detection code before, but both my skill and my time were limited.
Recently I want to squeeze out some time to walk through one deep learning codebase end to end, which should also help me learn PyTorch better.
The code analyzed here targets PyTorch 0.4, so later I may need to study the differences from the latest version on my own.
For now I will reproduce it against 0.4 first; if there is still time afterwards, I may study the newer PyTorch releases and upgrade the code.
DAY 1
Analysis of the main function
Training is launched via training.py, and at runtime it only calls the main() function, so main is analyzed first.
main sets up logging and takes a separate .py file as the source of the config information.
Here sys.argv[0] is the running script itself, and sys.argv[1:] are the arguments that follow it.
After the config is imported, the batch size is scaled by the number of GPUs.
logging.basicConfig(level=logging.DEBUG, format="[%(asctime)s %(filename)s] %(message)s")

if len(sys.argv) != 2:
    logging.error("Usage: python training.py params.py")
    sys.exit()
params_path = sys.argv[1]
if not os.path.isfile(params_path):
    logging.error("no params file found! path: {}".format(params_path))
    sys.exit()
config = importlib.import_module(params_path[:-3]).TRAINING_PARAMS
config["batch_size"] *= len(config["parallels"])
Next, create the working directory and record it in the log (checkpoints will be saved there later). For example, with working_dir /data/yolo, backbone darknet_53, a 416x416 input and try 0, sub_working_dir comes out as something like /data/yolo/darknet_53/size416x416_try0/20180101120000 (the last component is a timestamp).
# Create sub_working_dir
sub_working_dir = '{}/{}/size{}x{}_try{}/{}'.format(
    config['working_dir'], config['model_params']['backbone_name'],
    config['img_w'], config['img_h'], config['try'],
    time.strftime("%Y%m%d%H%M%S", time.localtime()))
if not os.path.exists(sub_working_dir):
    os.makedirs(sub_working_dir)
config["sub_working_dir"] = sub_working_dir
logging.info("sub working dir: %s" % sub_working_dir)
Then the TensorBoard summary writer is created, the visible GPUs are set, and the train function is called.
# Create tf_summary writer
config["tensorboard_writer"] = SummaryWriter(sub_working_dir)
logging.info("Please use 'python -m tensorboard.main --logdir={}'".format(sub_working_dir))
# Start training
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, config["parallels"]))
train(config)
Summary of main: set up logging, read the command-line arguments, import and adjust the config, create the summary writer, and call train.
Analysis of the train function
train takes the dict-typed config as its input. This section only covers the overall flow and purpose of train; the classes involved will be analyzed separately later.
train sets global_step and is_training; the latter is True during training and is only False when exporting to ONNX.
It then instantiates ModelMain to load and initialize the network, and puts it into training mode.
config["global_step"] = config.get("start_step", 0)
is_training = False if config.get("export_onnx") else True
# Load and initialize network
net = ModelMain(config, is_training=is_training)
net.train(is_training)
Next the optimizer is set up. _get_optimizer (shown below) collects the network's parameters (net.parameters() yields nn.Parameter objects) into parameter groups, i.e. a list of dicts, so the optimizer can apply different settings to different subsets of parameters.
The hyperparameters involved are the learning rate and weight_decay (i.e. L2 regularization).
# Optimizer and learning rate
optimizer = _get_optimizer(config, net)
lr_scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=config["lr"]["decay_step"],
    gamma=config["lr"]["decay_gamma"])
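A quick toy check of StepLR's behavior (my own example, not repo code): every param group's lr is multiplied by gamma once step_size scheduler steps have accumulated, i.e. lr = base_lr * gamma ** (epoch // step_size).

# Toy demonstration of StepLR decay (not repo code)
import torch
import torch.optim as optim

param = torch.nn.Parameter(torch.zeros(1))
opt = optim.SGD([param], lr=0.1)
sched = optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.5)
for epoch in range(6):
    print(epoch, opt.param_groups[0]["lr"])   # 0.1, 0.1, 0.05, 0.05, 0.025, 0.025
    sched.step()

_get_optimizer, shown next, is where the parameter groups that this scheduler decays are assembled.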
def _get_optimizer(config, net):
    optimizer = None

    # Assign different lr for each layer
    params = None
    base_params = list(
        map(id, net.backbone.parameters())
    )
    logits_params = filter(lambda p: id(p) not in base_params, net.parameters())

    if not config["lr"]["freeze_backbone"]:
        params = [
            {"params": logits_params, "lr": config["lr"]["other_lr"]},
            {"params": net.backbone.parameters(), "lr": config["lr"]["backbone_lr"]},
        ]
    else:
        logging.info("freeze backbone's parameters.")
        for p in net.backbone.parameters():
            p.requires_grad = False
        params = [
            {"params": logits_params, "lr": config["lr"]["other_lr"]},
        ]

    # Initialize optimizer class
    if config["optimizer"]["type"] == "adam":
        optimizer = optim.Adam(params, weight_decay=config["optimizer"]["weight_decay"])
    elif config["optimizer"]["type"] == "amsgrad":
        optimizer = optim.Adam(params, weight_decay=config["optimizer"]["weight_decay"],
                               amsgrad=True)
    elif config["optimizer"]["type"] == "rmsprop":
        optimizer = optim.RMSprop(params, weight_decay=config["optimizer"]["weight_decay"])
    else:
        # Default to sgd
        logging.info("Using SGD optimizer.")
        optimizer = optim.SGD(params, momentum=0.9,
                              weight_decay=config["optimizer"]["weight_decay"],
                              nesterov=(config["optimizer"]["type"] == "nesterov"))

    return optimizer
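The id(...) trick above simply records the memory ids of the backbone's parameters so that every remaining parameter can be filtered into its own group. A toy check of the split (mine, not repo code):

# Toy check of splitting parameters by id (not repo code)
import torch.nn as nn

class ToyNet(nn.Module):
    def __init__(self):
        super(ToyNet, self).__init__()
        self.backbone = nn.Linear(4, 4)   # stands in for darknet53
        self.head = nn.Linear(4, 2)       # stands in for the YOLO heads

net = ToyNet()
backbone_ids = list(map(id, net.backbone.parameters()))
head_params = [p for p in net.parameters() if id(p) not in backbone_ids]
print(len(backbone_ids), len(head_params))   # 2 2 (weight + bias each)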
Set up data parallelism over multiple GPUs and move net onto the GPU.
# Set data parallel
net = nn.DataParallel(net)
net = net.cuda()
Restore pretrained weights:
# Restore pretrain model
if config["pretrain_snapshot"]:
    logging.info("Load pretrained weights from {}".format(config["pretrain_snapshot"]))
    state_dict = torch.load(config["pretrain_snapshot"])
    net.load_state_dict(state_dict)
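One caveat worth noting (my own remark, not from the repo): net has already been wrapped in nn.DataParallel at this point, so the checkpoint's keys must carry the module. prefix. If a snapshot was saved from an unwrapped model, a hypothetical fix-up would look like:

# Hypothetical fix-up if the snapshot was saved without DataParallel (not repo code)
state_dict = torch.load(config["pretrain_snapshot"])
if not list(state_dict.keys())[0].startswith("module."):
    state_dict = {"module." + k: v for k, v in state_dict.items()}
net.load_state_dict(state_dict)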
Create the loss modules and load the data (not yet studied in detail):
# YOLO loss with 3 scales
yolo_losses = []
for i in range(3):
    yolo_losses.append(YOLOLoss(config["yolo"]["anchors"][i],
                                config["yolo"]["classes"], (config["img_w"], config["img_h"])))

# DataLoader
dataloader = torch.utils.data.DataLoader(COCODataset(config["train_path"],
                                                     (config["img_w"], config["img_h"]),
                                                     is_training=True),
                                         batch_size=config["batch_size"],
                                         shuffle=True, num_workers=32, pin_memory=True)
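From how samples is unpacked in the loop below, COCODataset.__getitem__ must return a dict with at least the keys "image" and "label". A stand-in dataset along those lines, useful for dry-running the loop (the label shape is purely my assumption):

# Hypothetical stand-in dataset matching the sample dict interface (not repo code)
import torch
from torch.utils.data import Dataset

class FakeDataset(Dataset):
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        return {
            "image": torch.zeros(3, 416, 416),   # CHW float image
            "label": torch.zeros(50, 5),         # assumed (max_boxes, cls+box) layout
        }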
Now the training loop starts. Each iteration fetches the images and labels, computes the loss separately for each anchor scale, backpropagates, and saves the weights every fixed number of steps (every 1000 steps here, plus once at the very end).
# Start the training loop
logging.info("Start training.")
for epoch in range(config["epochs"]):
    for step, samples in enumerate(dataloader):
        images, labels = samples["image"], samples["label"]
        start_time = time.time()
        config["global_step"] += 1

        # Forward and backward
        optimizer.zero_grad()
        outputs = net(images)
        losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls"]
        losses = []
        for _ in range(len(losses_name)):
            losses.append([])
        for i in range(3):
            _loss_item = yolo_losses[i](outputs[i], labels)
            for j, l in enumerate(_loss_item):
                losses[j].append(l)
        losses = [sum(l) for l in losses]
        loss = losses[0]
        loss.backward()
        optimizer.step()

        if step > 0 and step % 10 == 0:
            _loss = loss.item()
            duration = float(time.time() - start_time)
            example_per_second = config["batch_size"] / duration
            lr = optimizer.param_groups[0]['lr']
            logging.info(
                "epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f " %
                (epoch, step, _loss, example_per_second, lr)
            )
            config["tensorboard_writer"].add_scalar("lr",
                                                    lr,
                                                    config["global_step"])
            config["tensorboard_writer"].add_scalar("example/sec",
                                                    example_per_second,
                                                    config["global_step"])
            for i, name in enumerate(losses_name):
                value = _loss if i == 0 else losses[i]
                config["tensorboard_writer"].add_scalar(name,
                                                        value,
                                                        config["global_step"])

        if step > 0 and step % 1000 == 0:
            # net.train(False)
            _save_checkpoint(net.state_dict(), config)
            # net.train(True)

    lr_scheduler.step()

# net.train(False)
_save_checkpoint(net.state_dict(), config)
# net.train(True)
logging.info("Bye~")
The checkpoint-saving function is shown below. Note that the file name is fixed to model.pth, so each save overwrites the previous checkpoint in the same sub_working_dir:
def _save_checkpoint(state_dict, config, evaluate_func=None):
    # global best_eval_result
    checkpoint_path = os.path.join(config["sub_working_dir"], "model.pth")
    torch.save(state_dict, checkpoint_path)
    logging.info("Model checkpoint saved to %s" % checkpoint_path)
That concludes day one of the analysis, which covered only training.py; to be continued tomorrow.
DAY 2
Today's goal is to analyze the remaining functions and classes used by train. The tentative reproduction plan: first study every function and class that training needs, then rebuild the files one by one until everything has been replaced.
Flow of the train function: config -> net(init) -> optimizer -> cuda -> load model -> loss(build) -> dataloader -> start train
Flow of the training loop: get images & labels -> zero_grad -> compute losses -> backward (gradient computation) -> optimizer step -> save model
config was dynamically imported via importlib in the main function analyzed earlier.
net is the class ModelMain in model_main.py, which is responsible for building the network.
# Load and initialize network
net = ModelMain(config, is_training=is_training)
net.train(is_training)
The ModelMain class is analyzed below.
First its __init__ function: it stores the required config information, builds the backbone, and builds three embedding branches (presumably the prediction heads for the three scales).
The config hand-off:
def __init__(self, config, is_training=True):
    super(ModelMain, self).__init__()
    self.config = config
    self.training = is_training
    self.model_params = config["model_params"]
Now for the backbone part.
First the backbone constructor is selected: both the nets and backbone folders are organized as packages, and _backbone_fn resolves to the function darknet.darknet53, which is instantiated here; the output filter count of each stage (layers_out_filters) is then read from the resulting model.
# backbone
_backbone_fn = backbone_fn[self.model_params["backbone_name"]]
self.backbone = _backbone_fn(self.model_params["backbone_pretrained"])
_out_filters = self.backbone.layers_out_filters
Next, darknet53. This function builds the model via DarkNet and loads pretrained weights if a path is given.
def darknet53(pretrained, **kwargs):
    """Constructs a darknet-53 model.
    """
    model = DarkNet([1, 2, 8, 8, 4])
    if pretrained:
        if isinstance(pretrained, str):
            model.load_state_dict(torch.load(pretrained))
        else:
            raise Exception("darknet request a pretrained path. got [{}]".format(pretrained))
    return model
DarkNet itself is shown below (a structure diagram will follow later); note that when used here, the final classification layer of the standard DarkNet-53 is dropped, since only the feature maps are needed.
During construction, the first convolution is built on its own; every later downsampling convolution is built together with the residual blocks that follow it, inside the _make_layer function, where blocks is the number of block repetitions and planes holds the kernel counts of the two convolutions inside each block. The blocks themselves are instances of BasicBlock.
DarkNet returns features at three downsampling levels.
class DarkNet(nn.Module):
    def __init__(self, layers):
        super(DarkNet, self).__init__()
        self.inplanes = 32
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.inplanes)
        self.relu1 = nn.LeakyReLU(0.1)

        self.layer1 = self._make_layer([32, 64], layers[0])
        self.layer2 = self._make_layer([64, 128], layers[1])
        self.layer3 = self._make_layer([128, 256], layers[2])
        self.layer4 = self._make_layer([256, 512], layers[3])
        self.layer5 = self._make_layer([512, 1024], layers[4])

        self.layers_out_filters = [64, 128, 256, 512, 1024]

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, planes, blocks):
        layers = []
        # downsample
        layers.append(("ds_conv", nn.Conv2d(self.inplanes, planes[1], kernel_size=3,
                                            stride=2, padding=1, bias=False)))
        layers.append(("ds_bn", nn.BatchNorm2d(planes[1])))
        layers.append(("ds_relu", nn.LeakyReLU(0.1)))
        # blocks
        self.inplanes = planes[1]
        for i in range(0, blocks):
            layers.append(("residual_{}".format(i), BasicBlock(self.inplanes, planes)))
        return nn.Sequential(OrderedDict(layers))

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.layer1(x)
        x = self.layer2(x)
        out3 = self.layer3(x)
        out4 = self.layer4(out3)
        out5 = self.layer5(out4)
        return out3, out4, out5
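As a concrete check of the three scales (my own example; running it needs the BasicBlock class shown just below, plus the file's usual imports): with a 416x416 input, out3/out4/out5 come out at strides 8/16/32, with the channel counts recorded in layers_out_filters.

# Shape check of the three feature scales (not repo code)
import torch

model = DarkNet([1, 2, 8, 8, 4])
x = torch.randn(1, 3, 416, 416)
out3, out4, out5 = model(x)
print(out3.shape)   # torch.Size([1, 256, 52, 52])  -- stride 8
print(out4.shape)   # torch.Size([1, 512, 26, 26])  -- stride 16
print(out5.shape)   # torch.Size([1, 1024, 13, 13]) -- stride 32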
BasicBlock implements the residual block; the code is as follows:
class BasicBlock(nn.Module):
    def __init__(self, inplanes, planes):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes[0], kernel_size=1,
                               stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(planes[0])
        self.relu1 = nn.LeakyReLU(0.1)
        self.conv2 = nn.Conv2d(planes[0], planes[1], kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes[1])
        self.relu2 = nn.LeakyReLU(0.1)

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu2(out)

        out += residual
        return out
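Note that the shortcut is a plain identity with no 1x1 projection, so a block only works when inplanes equals planes[1]; _make_layer guarantees this by setting self.inplanes = planes[1] before stacking the blocks. A quick shape-preservation check (mine, not repo code):

# BasicBlock keeps the input shape: squeeze to planes[0], expand back to planes[1]
import torch

block = BasicBlock(64, [32, 64])
x = torch.randn(1, 64, 52, 52)
print(block(x).shape)   # torch.Size([1, 64, 52, 52])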
At this point the backbone is fully built. Everything above is driven by just these lines:
# backbone
_backbone_fn = backbone_fn[self.model_params["backbone_name"]]
self.backbone = _backbone_fn(self.model_params["backbone_pretrained"])
_out_filters = self.backbone.layers_out_filters