PSGAN 初步搭建與訓練
在閱讀了妝容遷移的相關論文,比如PSGAN、BeautyGAN、SPADE、StarGAN v2等之後,我開始使用PyTorch初步復現PSGAN網絡結構。
網絡結構部分代碼如下:
# Makeup Apply Network(MANet)
class Generator(nn.Module):
    """Makeup Apply Network (MANet) generator with an encoder-decoder layout.

    The source image is encoded, the resulting feature map is morphed by
    ``AMM`` using the feature map that ``MDNet`` extracts from the reference
    image, and the decoder maps the morphed features back to a 3-channel
    image whose values lie in [-1, 1] (final ``Tanh``).

    Args:
        conv_dim: Channel count after the first convolution. The two stride-2
            convolutions double it twice, so the bottleneck works on
            ``conv_dim * 4`` channels.
        repeat_num: Total number of residual blocks. They are split around
            the AMM module: ``repeat_num // 2`` blocks close the encoder and
            the remaining ones open the decoder (3 + 3 for the default of 6).

    Note:
        ``MDNet()`` and ``AMM()`` are constructed with their default
        arguments, so the encoder's ``conv_dim * 4`` output channels only
        line up with them for the default ``conv_dim=64`` (256 channels).
    """

    def __init__(self, conv_dim=64, repeat_num=6):
        super().__init__()

        # Split the residual bottleneck around AMM; an odd total gives the
        # extra block to the decoder.
        encoder_blocks = repeat_num // 2
        decoder_blocks = repeat_num - encoder_blocks

        # ---- Encoder -----------------------------------------------------
        encoder_layers = [
            nn.Conv2d(3, conv_dim, kernel_size=7, stride=1, padding=3, bias=False),
            # MANet's encoder norms are configured without affine parameters.
            nn.InstanceNorm2d(conv_dim, affine=False),
            nn.ReLU(inplace=True),
        ]

        # Down-sampling: two stride-2 convolutions, doubling channels each time.
        curr_dim = conv_dim
        for _ in range(2):
            encoder_layers.append(
                nn.Conv2d(curr_dim, curr_dim * 2, kernel_size=4, stride=2, padding=1, bias=False))
            encoder_layers.append(nn.InstanceNorm2d(curr_dim * 2, affine=False))
            encoder_layers.append(nn.ReLU(inplace=True))
            curr_dim = curr_dim * 2

        # Bottleneck, encoder half.
        for _ in range(encoder_blocks):
            encoder_layers.append(ResidualBlock(dim_in=curr_dim, dim_out=curr_dim))

        # ---- Decoder -----------------------------------------------------
        # Bottleneck, decoder half.
        decoder_layers = []
        for _ in range(decoder_blocks):
            decoder_layers.append(ResidualBlock(dim_in=curr_dim, dim_out=curr_dim))

        # Up-sampling: two stride-2 transposed convolutions, halving channels.
        for _ in range(2):
            decoder_layers.append(
                nn.ConvTranspose2d(curr_dim, curr_dim // 2, kernel_size=4, stride=2, padding=1, bias=False))
            decoder_layers.append(nn.InstanceNorm2d(curr_dim // 2, affine=True))
            decoder_layers.append(nn.ReLU(inplace=True))
            curr_dim = curr_dim // 2

        # Project back to 3 channels and squash to [-1, 1].
        decoder_layers.append(nn.Conv2d(curr_dim, 3, kernel_size=7, stride=1, padding=3, bias=False))
        decoder_layers.append(nn.Tanh())

        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)
        self.MDNet = MDNet()
        self.AMM = AMM()

    def forward(self, source_image, reference_image):
        """Transfer the reference image's makeup onto the source image.

        Args:
            source_image: Image batch fed to the encoder.
            reference_image: Image batch fed to ``MDNet``.

        Returns:
            The decoder output for the AMM-morphed source feature map.
        """
        fm_source = self.encoder(source_image)
        fm_reference = self.MDNet(reference_image)
        morphed_fm = self.AMM(fm_source, fm_reference)
        result = self.decoder(morphed_fm)
        return result
class MDNet(nn.Module):
    """Feature extractor applied to the reference image (MDNet).

    The layout is similar to the encoder of StarGAN: a 7x7 stem convolution,
    two stride-2 down-sampling convolutions that each double the channel
    count, and ``repeat_num`` residual blocks. Every convolution is followed
    by an affine instance norm and a ReLU.

    Args:
        conv_dim: Channel count produced by the stem convolution; the output
            feature map has ``conv_dim * 4`` channels.
        repeat_num: Number of residual blocks appended after down-sampling.
    """

    def __init__(self, conv_dim=64, repeat_num=3):
        super().__init__()

        def norm_act(width):
            # Affine instance norm followed by an in-place ReLU.
            return [nn.InstanceNorm2d(width, affine=True), nn.ReLU(inplace=True)]

        # Stem: keeps the spatial resolution, lifts RGB to conv_dim channels.
        stack = [nn.Conv2d(3, conv_dim, kernel_size=7, stride=1, padding=3, bias=False)]
        stack += norm_act(conv_dim)

        # Down-sampling: each stage halves the resolution and doubles channels.
        width = conv_dim
        for _ in range(2):
            doubled = width * 2
            stack.append(
                nn.Conv2d(width, doubled, kernel_size=4, stride=2, padding=1, bias=False))
            stack += norm_act(doubled)
            width = doubled

        # Bottleneck: residual blocks at constant width.
        stack.extend(
            ResidualBlock(dim_in=width, dim_out=width) for _ in range(repeat_num))

        self.main = nn.Sequential(*stack)

    def forward(self, reference_image):
        """Return the feature map extracted from ``reference_image``."""
        return self.main(reference_image)
# AMM暫時先不用landmark detector,先試試一般的attention效果如何
# feature map也先不乘以visual_feature_weight
# 這裏attention部分的計算也先存疑,因爲論文中提到x和y需要是同一個區域,但結構圖中是softmax
class AMM(nn.Module):
    """Attentive Makeup Morphing module (plain-attention variant).

    Two 1x1 convolutions reduce the reference feature map to single-channel
    ``lambda`` (scale) and ``beta`` (shift) maps. A dot-product attention
    between every source position and every reference position, normalised
    with a softmax over the reference positions, redistributes those maps
    onto the source layout. The source feature map is then modulated as
    ``fm_source * lambda + beta``.

    This version does not use a landmark detector and does not apply
    ``visual_feature_weight``; the attribute is only stored.

    Args:
        channels: Channel count of the incoming feature maps. Defaults to
            256, the value that was previously hard-coded.
    """

    def __init__(self, channels=256):
        super().__init__()
        # Stored for a later weighting of the visual features; forward()
        # does not read it yet.
        self.visual_feature_weight = 0.01
        self.lambda_matrix_conv = nn.Conv2d(in_channels=channels, out_channels=1, kernel_size=1)
        self.beta_matrix_conv = nn.Conv2d(in_channels=channels, out_channels=1, kernel_size=1)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, fm_source, fm_reference):
        """Morph ``fm_source`` with scale/shift maps attended from ``fm_reference``.

        Args:
            fm_source: Source feature map of shape (B, C, H_src, W_src).
            fm_reference: Reference feature map of shape (B, C, H_ref, W_ref).
                The spatial size may differ from the source's.

        Returns:
            Tensor with the same shape as ``fm_source``.
        """
        # Use the source's own spatial size for the output layout so the two
        # feature maps are not required to share a resolution.
        batch_size, _, src_h, src_w = fm_source.size()

        # Per-position scale and shift of the reference: (B, 1, N_ref).
        ref_lambda = self.lambda_matrix_conv(fm_reference).flatten(2)
        ref_beta = self.beta_matrix_conv(fm_reference).flatten(2)

        ref_flat = fm_reference.flatten(2)               # (B, C, N_ref)
        src_flat = fm_source.flatten(2).transpose(1, 2)  # (B, N_src, C)

        # energy[b, i, j] = <source position i, reference position j>.
        energy = torch.bmm(src_flat, ref_flat)           # (B, N_src, N_ref)
        # Softmax over the last dim: each source position holds a
        # distribution over reference positions.
        attention_map = self.softmax(energy)
        attention_t = attention_map.transpose(1, 2)      # (B, N_ref, N_src)

        # Attention-weighted average of the reference maps per source position.
        src_lambda = torch.bmm(ref_lambda, attention_t).view(batch_size, 1, src_h, src_w)
        src_beta = torch.bmm(ref_beta, attention_t).view(batch_size, 1, src_h, src_w)

        # The single-channel maps broadcast across all channels of fm_source.
        morphed_fm_source = fm_source * src_lambda + src_beta
        return morphed_fm_source
訓練數據讀取代碼如下:
def get_loader(data_config, config, mode="train"):
    """Build the train and test ``DataLoader`` objects for the configured dataset.

    Args:
        data_config: Object providing ``name`` (an expression that evaluates
            to the dataset class in this module's scope) and ``dataset_path``.
        config: Object providing ``img_size``, ``data_path``, ``batch_size``
            and ``cls_list``.
        mode: ``"train"`` to build both loaders, ``"test"`` to build only the
            test loader.

    Returns:
        ``[data_loader_train, data_loader_test]``. ``data_loader_train`` is
        ``None`` when ``mode == "test"``. The test loader always uses a batch
        size of 1 and no shuffling.

    Raises:
        ValueError: If ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    # Fail early with a clear message instead of hitting an unbound local
    # further down.
    if mode not in ("train", "test"):
        raise ValueError(
            "mode must be 'train' or 'test', got {!r}".format(mode))

    transform = transforms.Compose([
        transforms.Resize(config.img_size),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ])
    # Masks are resized with nearest-neighbour interpolation.
    # NOTE(review): ``ToTensor`` is passed as a bare callable, not
    # ``transforms.ToTensor()`` - presumably a module-level helper; confirm.
    transform_mask = transforms.Compose([
        transforms.Resize(config.img_size, interpolation=PIL.Image.NEAREST),
        ToTensor,
    ])
    print(config.data_path)

    # NOTE(review): eval() turns a config string into the dataset class. This
    # is only acceptable for trusted configuration; consider an explicit
    # name -> class registry instead.
    dataset_cls = eval(data_config.name)

    def build_dataset(split):
        # Both splits share every argument except ``mode``.
        return dataset_cls(data_config.dataset_path, transform=transform, mode=split,
                           transform_mask=transform_mask, cls_list=config.cls_list)

    data_loader_train = None
    if mode == "train":
        dataset_train = build_dataset("train")
        dataset_test = build_dataset("test")
        data_loader_train = DataLoader(dataset=dataset_train,
                                       batch_size=config.batch_size,
                                       shuffle=True)
    else:
        dataset_test = build_dataset("test")

    data_loader_test = DataLoader(dataset=dataset_test,
                                  batch_size=1,
                                  shuffle=False)
    return [data_loader_train, data_loader_test]
訓練尚未完成,以下是最近兩輪迭代得到的妝容遷移效果:
第50輪迭代效果:
第49輪迭代效果: