import torch
from argparse import Namespace
import torchvision.transforms as transforms
import clip
import numpy as np
from PIL import Image
import sys

sys.path.append(".")
sys.path.append("..")

from models.e4e_features import pSp
from adapter.adapter_decoder import CLIPAdapterWithDecoder

import gradio as gr


def tensor2im(var):
    # Convert a (C, H, W) tensor in [-1, 1] to an (H, W, C) uint8 image array.
    var = var.cpu().detach().transpose(0, 2).transpose(0, 1).numpy()
    var = (var + 1) / 2
    var[var < 0] = 0
    var[var > 1] = 1
    var = var * 255
    return var.astype('uint8')


def run_alignment(image_path):
    # Align and crop the face using dlib's 68-point landmark predictor.
    import dlib
    from align_faces_parallel import align_face
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
    aligned_image = align_face(image_path, predictor=predictor)
    # print("Aligned image has shape: {}".format(aligned_image.size))
    return aligned_image


input_transforms = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

# Load the trained CLIPInverter checkpoint and the pretrained e4e encoder.
model_path = 'pretrained_faces.pt'
e4e_path = 'e4e_ffhq_encode.pt'
ckpt = torch.load(model_path, map_location='cpu')
opts = ckpt['opts']
opts['checkpoint_path'] = model_path
opts['pretrained_e4e_path'] = e4e_path
device = 'cuda' if torch.cuda.is_available() else 'cpu'
opts['device'] = device
opts = Namespace(**opts)

encoder = pSp(opts)
encoder.eval()
encoder.to(device)

adapter = CLIPAdapterWithDecoder(opts)
adapter.eval()
adapter.to(device)

clip_model, _ = clip.load("ViT-B/32", device=device)


def manipulate(input_image, caption):
    # Accept either a PIL image (from the Gradio input) or a file path.
    if not isinstance(input_image, Image.Image):
        input_image = Image.open(input_image).convert('RGB')
    else:
        input_image = input_image.convert('RGB')
    aligned_image = run_alignment(input_image)
    input_image = input_transforms(aligned_image)
    input_image = input_image.unsqueeze(0)

    text_input = clip.tokenize(caption)
    text_input = text_input.to(device)
    input_image = input_image.to(device).float()

    with torch.no_grad():
        # Encode the caption with CLIP and the image with the e4e encoder.
        text_features = clip_model.encode_text(text_input).float()
        w, features = encoder.forward(input_image, return_latents=True)
        # Modulate the encoder features with the text features and shift the latent code.
        features = adapter.adapter(features, text_features)
        w_hat = w + 0.1 * encoder.forward_features(features)
        # Decode the edited latent code back into an image.
        result_tensor, _ = adapter.decoder([w_hat], input_is_latent=True, return_latents=False,
                                           randomize_noise=False, truncation=1, txt_embed=text_features)
    result_tensor = result_tensor.squeeze(0)
    result_image = tensor2im(result_tensor)
    return result_image


gr.Interface(fn=manipulate,
             inputs=[gr.Image(type="pil"), "text"],
             outputs="image",
             examples=[['example.jpg', "He has mustache"]],
             title="CLIPInverter").launch()
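
# A minimal sketch of running the editing pipeline without the Gradio UI,
# assuming 'example.jpg' (the example file referenced above) is available
# next to this script; manipulate() returns an HWC uint8 numpy array:
#
#     from PIL import Image
#     edited = manipulate(Image.open("example.jpg"), "He has mustache")
#     Image.fromarray(edited).save("edited.jpg")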