How to Animate a Person in a Photo with Python? This One Article Will Get You There

Editor | Li Xuejing
Header image | CSDN, downloaded from Visual China
Produced by | AI科技大本营 (ID: rgznai100)
Recently, a project that makes a Mona Lisa portrait move went viral on WeChat Moments. Today we will do the same thing: make the person in a still image move along with the person in a driving video. In recent years, deep generative models have emerged as an effective technique for image animation and video retargeting. In particular, generative adversarial networks (GANs) and variational autoencoders (VAEs) have been used to transfer facial expressions and motion patterns between human subjects in videos.
Today we will build the model from the source code released with the paper (the First Order Motion Model project) and generate the character motion we need. The overall workflow is described below. The required modules are:
The imageio module handles image and video input and output.
The Matplotlib module handles plotting.
The numpy module handles matrix operations.
The Pillow library handles image loading and preprocessing.
The pytorch module builds the model and runs training and inference.
See the requirements.txt file for the full list of dependencies; a sketch of the corresponding imports follows.
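For reference, here is a minimal sketch of the imports the demo code below relies on. The project-specific modules (modules.generator, modules.keypoint_detector, animate, sync_batchnorm) are assumed to come from the first-order-model repository and must be importable from the working directory:

import yaml
import imageio
import numpy as np
import torch
from argparse import ArgumentParser
from tqdm import tqdm
from scipy.spatial import ConvexHull
from skimage.transform import resize
from skimage import img_as_ubyte

# Project modules from the first-order-model repository
from modules.generator import OcclusionAwareGenerator
from modules.keypoint_detector import KPDetector
from animate import normalize_kp
from sync_batchnorm import DataParallelWithCallback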
(1) First, load the trained model. The checkpoint-loading routine is as follows:
def load_checkpoints(config_path, checkpoint_path, cpu=False):
    with open(config_path) as f:
        config = yaml.load(f)
    generator = OcclusionAwareGenerator(**config['model_params']['generator_params'],
                                        **config['model_params']['common_params'])
    if not cpu:
        generator.cuda()
    kp_detector = KPDetector(**config['model_params']['kp_detector_params'],
                             **config['model_params']['common_params'])
    if not cpu:
        kp_detector.cuda()
    if cpu:
        checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    else:
        checkpoint = torch.load(checkpoint_path)
    generator.load_state_dict(checkpoint['generator'])
    kp_detector.load_state_dict(checkpoint['kp_detector'])
    if not cpu:
        generator = DataParallelWithCallback(generator)
        kp_detector = DataParallelWithCallback(kp_detector)
    generator.eval()
    kp_detector.eval()
    return generator, kp_detector
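As a quick sanity check, the loader can also be called on its own. The config and checkpoint paths below are placeholders for the files you downloaded with the pretrained model:

# Hypothetical paths -- substitute your own config and pretrained checkpoint
generator, kp_detector = load_checkpoints(config_path='config/vox-256.yaml',
                                          checkpoint_path='vox-cpk.pth.tar',
                                          cpu=True)   # set cpu=False to run on the GPU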
(2) Next, use the model to generate the animated frames, and find the driving-video frame whose facial keypoints best match the source image:
def make_animation(source_image, driving_video, generator, kp_detector, relative=True, adapt_movement_scale=True, cpu=False):
    with torch.no_grad():
        predictions = []
        source = torch.tensor(source_image[np.newaxis].astype(np.float32)).permute(0, 3, 1, 2)
        if not cpu:
            source = source.cuda()
        driving = torch.tensor(np.array(driving_video)[np.newaxis].astype(np.float32)).permute(0, 4, 1, 2, 3)
        kp_source = kp_detector(source)
        kp_driving_initial = kp_detector(driving[:, :, 0])
        for frame_idx in tqdm(range(driving.shape[2])):
            driving_frame = driving[:, :, frame_idx]
            if not cpu:
                driving_frame = driving_frame.cuda()
            kp_driving = kp_detector(driving_frame)
            kp_norm = normalize_kp(kp_source=kp_source, kp_driving=kp_driving,
                                   kp_driving_initial=kp_driving_initial, use_relative_movement=relative,
                                   use_relative_jacobian=relative, adapt_movement_scale=adapt_movement_scale)
            out = generator(source, kp_source=kp_source, kp_driving=kp_norm)
            predictions.append(np.transpose(out['prediction'].data.cpu().numpy(), [0, 2, 3, 1])[0])
    return predictions

def find_best_frame(source, driving, cpu=False):
    import face_alignment

    def normalize_kp(kp):
        kp = kp - kp.mean(axis=0, keepdims=True)
        area = ConvexHull(kp[:, :2]).volume
        area = np.sqrt(area)
        kp[:, :2] = kp[:, :2] / area
        return kp

    fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, flip_input=True,
                                      device='cpu' if cpu else 'cuda')
    kp_source = fa.get_landmarks(255 * source)[0]
    kp_source = normalize_kp(kp_source)
    norm = float('inf')
    frame_num = 0
    for i, image in tqdm(enumerate(driving)):
        kp_driving = fa.get_landmarks(255 * image)[0]
        kp_driving = normalize_kp(kp_driving)
        new_norm = (np.abs(kp_source - kp_driving) ** 2).sum()
        if new_norm < norm:
            norm = new_norm
            frame_num = i
    return frame_num
(3) Then define the command-line arguments that load the source image, the driving video, and the other options:
parser = ArgumentParser()
parser.add_argument("--config", required=True, help="path to config")
parser.add_argument("--checkpoint", default='vox-cpk.pth.tar', help="path to checkpoint to restore")
parser.add_argument("--source_image", default='sup-mat/source.png', help="path to source image")
parser.add_argument("--driving_video", default='sup-mat/source.png', help="path to driving video")
parser.add_argument("--result_video", default='result.mp4', help="path to output")
parser.add_argument("--relative", dest="relative", action="store_true", help="use relative or absolute keypoint coordinates")
parser.add_argument("--adapt_scale", dest="adapt_scale", action="store_true", help="adapt movement scale based on convex hull of keypoints")
parser.add_argument("--find_best_frame", dest="find_best_frame", action="store_true",
                    help="Generate from the frame that is the most aligned with source. (Only for faces, requires face_alignment lib)")
parser.add_argument("--best_frame", dest="best_frame", type=int, default=None,
                    help="Set frame to start from.")
parser.add_argument("--cpu", dest="cpu", action="store_true", help="cpu mode.")
parser.set_defaults(relative=False)
parser.set_defaults(adapt_scale=False)
opt = parser.parse_args()

source_image = imageio.imread(opt.source_image)
reader = imageio.get_reader(opt.driving_video)
fps = reader.get_meta_data()['fps']
driving_video = []
try:
    for im in reader:
        driving_video.append(im)
except RuntimeError:
    pass
reader.close()

source_image = resize(source_image, (256, 256))[..., :3]
driving_video = [resize(frame, (256, 256))[..., :3] for frame in driving_video]
generator, kp_detector = load_checkpoints(config_path=opt.config, checkpoint_path=opt.checkpoint, cpu=opt.cpu)

if opt.find_best_frame or opt.best_frame is not None:
    i = opt.best_frame if opt.best_frame is not None else find_best_frame(source_image, driving_video, cpu=opt.cpu)
    print("Best frame: " + str(i))
    driving_forward = driving_video[i:]
    driving_backward = driving_video[:(i + 1)][::-1]
    predictions_forward = make_animation(source_image, driving_forward, generator, kp_detector, relative=opt.relative, adapt_movement_scale=opt.adapt_scale, cpu=opt.cpu)
    predictions_backward = make_animation(source_image, driving_backward, generator, kp_detector, relative=opt.relative, adapt_movement_scale=opt.adapt_scale, cpu=opt.cpu)
    predictions = predictions_backward[::-1] + predictions_forward[1:]
else:
    predictions = make_animation(source_image, driving_video, generator, kp_detector, relative=opt.relative, adapt_movement_scale=opt.adapt_scale, cpu=opt.cpu)
imageio.mimsave(opt.result_video, [img_as_ubyte(frame) for frame in predictions], fps=fps)
Next come the building blocks of the model itself.
(1) A VGG19 network is defined, whose intermediate layers serve as the perceptual loss. The code is as follows:
class Vgg19(torch.nn.Module):
    """
    Vgg19 network for perceptual loss. See Sec 3.3.
    """
    def __init__(self, requires_grad=False):
        super(Vgg19, self).__init__()
        vgg_pretrained_features = models.vgg19(pretrained=True).features
        self.slice1 = torch.nn.Sequential()
        self.slice2 = torch.nn.Sequential()
        self.slice3 = torch.nn.Sequential()
        self.slice4 = torch.nn.Sequential()
        self.slice5 = torch.nn.Sequential()
        for x in range(2):
            self.slice1.add_module(str(x), vgg_pretrained_features[x])
        for x in range(2, 7):
            self.slice2.add_module(str(x), vgg_pretrained_features[x])
        for x in range(7, 12):
            self.slice3.add_module(str(x), vgg_pretrained_features[x])
        for x in range(12, 21):
            self.slice4.add_module(str(x), vgg_pretrained_features[x])
        for x in range(21, 30):
            self.slice5.add_module(str(x), vgg_pretrained_features[x])
        self.mean = torch.nn.Parameter(data=torch.Tensor(np.array([0.485, 0.456, 0.406]).reshape((1, 3, 1, 1))),
                                       requires_grad=False)
        self.std = torch.nn.Parameter(data=torch.Tensor(np.array([0.229, 0.224, 0.225]).reshape((1, 3, 1, 1))),
                                      requires_grad=False)
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        X = (X - self.mean) / self.std
        h_relu1 = self.slice1(X)
        h_relu2 = self.slice2(h_relu1)
        h_relu3 = self.slice3(h_relu2)
        h_relu4 = self.slice4(h_relu3)
        h_relu5 = self.slice5(h_relu4)
        out = [h_relu1, h_relu2, h_relu3, h_relu4, h_relu5]
        return out
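To see what the network returns, here is a minimal usage sketch. It assumes torch and torchvision's models have been imported for the class above; the input is a dummy N x 3 x H x W batch scaled to [0, 1]:

vgg = Vgg19()                    # loads the pretrained VGG19 weights via torchvision
vgg.eval()
x = torch.rand(1, 3, 256, 256)   # dummy image batch in [0, 1]
with torch.no_grad():
    features = vgg(x)            # list of five feature maps from increasingly deep layers
print([f.shape for f in features])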
(2) An image pyramid is built for computing the pyramid perceptual loss:
class ImagePyramide(torch.nn.Module):
    """
    Create image pyramide for computing pyramide perceptual loss. See Sec 3.3
    """
    def __init__(self, scales, num_channels):
        super(ImagePyramide, self).__init__()
        downs = {}
        for scale in scales:
            downs[str(scale).replace('.', '-')] = AntiAliasInterpolation2d(num_channels, scale)
        self.downs = nn.ModuleDict(downs)

    def forward(self, x):
        out_dict = {}
        for scale, down_module in self.downs.items():
            out_dict['prediction_' + str(scale).replace('-', '.')] = down_module(x)
        return out_dict
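Putting the two pieces together, the sketch below shows one way a pyramid perceptual loss can be computed between a generated frame and the corresponding real frame. The scale list and the unweighted sum are illustrative rather than the values from the training config, and AntiAliasInterpolation2d is assumed to be importable from the repository's modules.util:

scales = [1, 0.5, 0.25, 0.125]                   # illustrative pyramid scales
pyramid = ImagePyramide(scales, num_channels=3)
vgg = Vgg19().eval()

def pyramid_perceptual_loss(prediction, target):
    # Downsample both images to every scale, then compare their VGG features with an L1 distance.
    pyramid_generated = pyramid(prediction)
    pyramid_real = pyramid(target)
    total = 0
    for scale in scales:
        x_vgg = vgg(pyramid_generated['prediction_' + str(scale)])
        y_vgg = vgg(pyramid_real['prediction_' + str(scale)])
        for fx, fy in zip(x_vgg, y_vgg):
            total = total + torch.abs(fx - fy.detach()).mean()
    return total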
(3) A random thin-plate-spline (TPS) transformation used for the equivariance constraint:
class Transform:
    """
    Random tps transformation for equivariance constraints. See Sec 3.3
    """
    def __init__(self, bs, **kwargs):
        noise = torch.normal(mean=0, std=kwargs['sigma_affine'] * torch.ones([bs, 2, 3]))
        self.theta = noise + torch.eye(2, 3).view(1, 2, 3)
        self.bs = bs
        if ('sigma_tps' in kwargs) and ('points_tps' in kwargs):
            self.tps = True
            self.control_points = make_coordinate_grid((kwargs['points_tps'], kwargs['points_tps']), type=noise.type())
            self.control_points = self.control_points.unsqueeze(0)
            self.control_params = torch.normal(mean=0,
                                               std=kwargs['sigma_tps'] * torch.ones([bs, 1, kwargs['points_tps'] ** 2]))
        else:
            self.tps = False

    def transform_frame(self, frame):
        grid = make_coordinate_grid(frame.shape[2:], type=frame.type()).unsqueeze(0)
        grid = grid.view(1, frame.shape[2] * frame.shape[3], 2)
        grid = self.warp_coordinates(grid).view(self.bs, frame.shape[2], frame.shape[3], 2)
        return F.grid_sample(frame, grid, padding_mode="reflection")

    def warp_coordinates(self, coordinates):
        theta = self.theta.type(coordinates.type())
        theta = theta.unsqueeze(1)
        transformed = torch.matmul(theta[:, :, :, :2], coordinates.unsqueeze(-1)) + theta[:, :, :, 2:]
        transformed = transformed.squeeze(-1)
        if self.tps:
            control_points = self.control_points.type(coordinates.type())
            control_params = self.control_params.type(coordinates.type())
            distances = coordinates.view(coordinates.shape[0], -1, 1, 2) - control_points.view(1, 1, -1, 2)
            distances = torch.abs(distances).sum(-1)
            result = distances ** 2
            result = result * torch.log(distances + 1e-6)
            result = result * control_params
            result = result.sum(dim=2).view(self.bs, coordinates.shape[1], 1)
            transformed = transformed + result
        return transformed

    def jacobian(self, coordinates):
        new_coordinates = self.warp_coordinates(coordinates)
        grad_x = grad(new_coordinates[..., 0].sum(), coordinates, create_graph=True)
        grad_y = grad(new_coordinates[..., 1].sum(), coordinates, create_graph=True)
        jacobian = torch.cat([grad_x[0].unsqueeze(-2), grad_y[0].unsqueeze(-2)], dim=-2)
        return jacobian
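A minimal sketch of how this transform is typically used for the equivariance constraint: keypoints detected on a randomly warped frame should, after their coordinates are warped back, match the keypoints detected on the original frame. The sigma/points values below are illustrative, frames is a placeholder batch of images, and kp_detector is assumed to return a dict with a 'value' entry of shape (bs, num_kp, 2) as in the repository:

# Illustrative parameter values; the real ones come from the training config.
transform = Transform(bs=frames.shape[0], sigma_affine=0.05, sigma_tps=0.005, points_tps=5)
transformed_frame = transform.transform_frame(frames)        # randomly warped images
kp_original = kp_detector(frames)['value']                   # keypoints on the original frames
kp_transformed = kp_detector(transformed_frame)['value']     # keypoints on the warped frames
# Equivariance: warping the warped-frame keypoints back should recover the original keypoints.
equivariance_loss = torch.abs(kp_original - transform.warp_coordinates(kp_transformed)).mean()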
(4) The occlusion-aware generator, which warps the source image according to the motion induced by the keypoints:
class OcclusionAwareGenerator(nn.Module):
    def __init__(self, num_channels, num_kp, block_expansion, max_features, num_down_blocks,
                 num_bottleneck_blocks, estimate_occlusion_map=False, dense_motion_params=None, estimate_jacobian=False):
        super(OcclusionAwareGenerator, self).__init__()
        if dense_motion_params is not None:
            self.dense_motion_network = DenseMotionNetwork(num_kp=num_kp, num_channels=num_channels,
                                                           estimate_occlusion_map=estimate_occlusion_map,
                                                           **dense_motion_params)
        else:
            self.dense_motion_network = None

        self.first = SameBlock2d(num_channels, block_expansion, kernel_size=(7, 7), padding=(3, 3))

        down_blocks = []
        for i in range(num_down_blocks):
            in_features = min(max_features, block_expansion * (2 ** i))
            out_features = min(max_features, block_expansion * (2 ** (i + 1)))
            down_blocks.append(DownBlock2d(in_features, out_features, kernel_size=(3, 3), padding=(1, 1)))
        self.down_blocks = nn.ModuleList(down_blocks)

        up_blocks = []
        for i in range(num_down_blocks):
            in_features = min(max_features, block_expansion * (2 ** (num_down_blocks - i)))
            out_features = min(max_features, block_expansion * (2 ** (num_down_blocks - i - 1)))
            up_blocks.append(UpBlock2d(in_features, out_features, kernel_size=(3, 3), padding=(1, 1)))
        self.up_blocks = nn.ModuleList(up_blocks)

        self.bottleneck = torch.nn.Sequential()
        in_features = min(max_features, block_expansion * (2 ** num_down_blocks))
        for i in range(num_bottleneck_blocks):
            self.bottleneck.add_module('r' + str(i), ResBlock2d(in_features, kernel_size=(3, 3), padding=(1, 1)))

        self.final = nn.Conv2d(block_expansion, num_channels, kernel_size=(7, 7), padding=(3, 3))
        self.estimate_occlusion_map = estimate_occlusion_map
        self.num_channels = num_channels
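For reference, the generator can be instantiated directly. The hyperparameters below mirror a typical vox-256 setup but are only illustrative; the authoritative values live in the YAML config loaded by load_checkpoints:

# Illustrative hyperparameters -- check the vox config YAML for the real values.
generator = OcclusionAwareGenerator(num_channels=3, num_kp=10, block_expansion=64,
                                    max_features=512, num_down_blocks=2,
                                    num_bottleneck_blocks=6, estimate_occlusion_map=True,
                                    dense_motion_params={'block_expansion': 64, 'max_features': 1024,
                                                         'num_blocks': 5, 'scale_factor': 0.25})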
(5) The discriminator, which is similar to the one used in Pix2Pix:
class Discriminator(nn.Module):
    def __init__(self, num_channels=3, block_expansion=64, num_blocks=4, max_features=512,
                 sn=False, use_kp=False, num_kp=10, kp_variance=0.01, **kwargs):
        super(Discriminator, self).__init__()
        down_blocks = []
        for i in range(num_blocks):
            down_blocks.append(
                DownBlock2d(num_channels + num_kp * use_kp if i == 0 else min(max_features, block_expansion * (2 ** i)),
                            min(max_features, block_expansion * (2 ** (i + 1))),
                            norm=(i != 0), kernel_size=4, pool=(i != num_blocks - 1), sn=sn))
        self.down_blocks = nn.ModuleList(down_blocks)
        self.conv = nn.Conv2d(self.down_blocks[-1].conv.out_channels, out_channels=1, kernel_size=1)
        if sn:
            self.conv = nn.utils.spectral_norm(self.conv)
        self.use_kp = use_kp
        self.kp_variance = kp_variance

    def forward(self, x, kp=None):
        feature_maps = []
        out = x
        if self.use_kp:
            heatmap = kp2gaussian(kp, x.shape[2:], self.kp_variance)
            out = torch.cat([out, heatmap], dim=1)
        for down_block in self.down_blocks:
            feature_maps.append(down_block(out))
            out = feature_maps[-1]
        prediction_map = self.conv(out)
        return feature_maps, prediction_map
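A minimal usage sketch, assuming the DownBlock2d and kp2gaussian helpers used above are importable from the repository:

discriminator = Discriminator()                   # defaults: 3 channels, 4 down blocks
fake_frame = torch.rand(1, 3, 256, 256)           # dummy generated frame
feature_maps, prediction_map = discriminator(fake_frame)
# feature_maps holds the intermediate activations (useful for feature-matching losses);
# prediction_map is the patch-wise real/fake score map.
print(prediction_map.shape)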
Finally, the demo is run with the following command (the checkpoint, source image, and driving video paths are placeholders), and the effect is as shown below:
python demo.py --config config/vox-adv-256.yaml --driving_video path/to/driving/1.mp4 --source_image path/to/source/7.jpg --checkpoint path/to/checkpoint/vox-adv-cpk.pth.tar --relative --adapt_scale
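If no GPU is available, the same command works with the --cpu flag defined above (the paths remain placeholders):
python demo.py --config config/vox-adv-256.yaml --driving_video path/to/driving/1.mp4 --source_image path/to/source/7.jpg --checkpoint path/to/checkpoint/vox-adv-cpk.pth.tar --relative --adapt_scale --cpu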
Extraction code: e4kx
About the author:
Li Qiujian, CSDN blog expert and author of CSDN Daren courses. He is a master's student at China University of Mining and Technology, with development experience that includes an award-winning entry in a TapTap competition.

