Update app.py

app.py CHANGED
@@ -7,7 +7,7 @@ import spaces
 
 import PIL
 from PIL import Image
-from typing import Tuple
+from typing import Tuple, List
 
 import diffusers
 from diffusers.utils import load_image
@@ -21,8 +21,6 @@ from insightface.app import FaceAnalysis
 from style_template import styles
 from pipeline_stable_diffusion_xl_instantid_full import StableDiffusionXLInstantIDPipeline, draw_kps
 
-# from controlnet_aux import OpenposeDetector
-
 import gradio as gr
 
 from depth_anything.dpt import DepthAnything
@@ -58,8 +56,6 @@ app = FaceAnalysis(
 )
 app.prepare(ctx_id=0, det_size=(640, 640))
 
-# openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
-
 depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_vitl14').to(device).eval()
 
 transform = Compose([
@@ -85,14 +81,10 @@ controlnet_identitynet = ControlNetModel.from_pretrained(
     controlnet_path, torch_dtype=dtype
 )
 
-# controlnet-pose/canny/depth
-# controlnet_pose_model = "thibaud/controlnet-openpose-sdxl-1.0"
+# controlnet-canny/depth
 controlnet_canny_model = "diffusers/controlnet-canny-sdxl-1.0"
 controlnet_depth_model = "diffusers/controlnet-depth-sdxl-1.0-small"
 
-# controlnet_pose = ControlNetModel.from_pretrained(
-#     controlnet_pose_model, torch_dtype=dtype
-# ).to(device)
 controlnet_canny = ControlNetModel.from_pretrained(
     controlnet_canny_model, torch_dtype=dtype
 ).to(device)
@@ -127,12 +119,10 @@ def get_canny_image(image, t1=100, t2=200):
     return Image.fromarray(edges, "L")
 
 controlnet_map = {
-    #"pose": controlnet_pose,
     "canny": controlnet_canny,
     "depth": controlnet_depth,
 }
 controlnet_map_fn = {
-    #"pose": openpose,
     "canny": get_canny_image,
     "depth": get_depth_map,
 }
@@ -180,67 +170,6 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
 def remove_tips():
     return gr.update(visible=False)
 
-def get_example():
-    case = [
-        [
-            "./examples/yann-lecun_resize.jpg",
-            None,
-            "a man",
-            "Spring Festival",
-            "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
-        ],
-        [
-            "./examples/musk_resize.jpeg",
-            "./examples/poses/pose2.jpg",
-            "a man flying in the sky in Mars",
-            "Mars",
-            "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
-        ],
-        [
-            "./examples/sam_resize.png",
-            "./examples/poses/pose4.jpg",
-            "a man doing a silly pose wearing a suite",
-            "Jungle",
-            "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
-        ],
-        [
-            "./examples/schmidhuber_resize.png",
-            "./examples/poses/pose3.jpg",
-            "a man sit on a chair",
-            "Neon",
-            "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
-        ],
-        [
-            "./examples/kaifu_resize.png",
-            "./examples/poses/pose.jpg",
-            "a man",
-            "Vibrant Color",
-            "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
-        ],
-    ]
-    return case
-
-def run_for_examples(face_file, pose_file, prompt, style, negative_prompt):
-    return generate_image(
-        face_file,
-        pose_file,
-        prompt,
-        negative_prompt,
-        style,
-        20, # num_steps
-        0.8, # identitynet_strength_ratio
-        0.8, # adapter_strength_ratio
-        #0.4, # pose_strength
-        0.3, # canny_strength
-        0.5, # depth_strength
-        ["depth", "canny"], # controlnet_selection
-        5.0, # guidance_scale
-        42, # seed
-        "EulerDiscreteScheduler", # scheduler
-        False, # enable_LCM
-        True, # enable_Face_Region
-    )
-
 def convert_from_cv2_to_image(img: np.ndarray) -> Image:
     return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
 
@@ -284,9 +213,12 @@ def apply_style(
     p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
     return p.replace("{prompt}", positive), n + " " + negative
 
+def update_face_gallery(files):
+    return gr.update(value=files, visible=True)
+
 @spaces.GPU
 def generate_image(
-    face_image_path,
+    face_images_path, # Now accepts a list of image paths
     pose_image_path,
     prompt,
     negative_prompt,
@@ -294,7 +226,6 @@ def generate_image(
     num_steps,
     identitynet_strength_ratio,
     adapter_strength_ratio,
-    #pose_strength,
     canny_strength,
     depth_strength,
     controlnet_selection,
@@ -321,9 +252,9 @@ def generate_image(
     scheduler = getattr(diffusers, scheduler_class_name)
     pipe.scheduler = scheduler.from_config(pipe.scheduler.config, **add_kwargs)
 
-    if face_image_path is None:
+    if face_images_path is None or len(face_images_path) == 0:
         raise gr.Error(
-            f"Cannot find any input face image! Please upload the face image"
+            f"Cannot find any input face images! Please upload at least one face image"
         )
 
     if prompt is None:
@@ -332,28 +263,67 @@ def generate_image(
     # apply the style template
     prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt)
 
-    face_image = load_image(face_image_path)
-    face_image = resize_img(face_image, max_side=1024)
-    face_image_cv2 = convert_from_image_to_cv2(face_image)
-    height, width, _ = face_image_cv2.shape
-
-    # Extract face features
-    face_info = app.get(face_image_cv2)
-
-    if len(face_info) == 0:
-        raise gr.Error(
-            f"Unable to detect a face in the image. Please upload a different photo with a clear face."
-        )
-
-    face_info = sorted(
-        face_info,
-        key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
-    )[-1]  # only use the maximum face
-    face_emb = face_info["embedding"]
-    face_kps = draw_kps(convert_from_cv2_to_image(face_image_cv2), face_info["kps"])
-    img_controlnet = face_image
+    # Use the first face image for face keypoints and size reference
+    reference_face_path = face_images_path[0] if isinstance(face_images_path, list) else face_images_path
+    reference_face_image = load_image(reference_face_path)
+    reference_face_image = resize_img(reference_face_image, max_side=1024)
+    reference_face_cv2 = convert_from_image_to_cv2(reference_face_image)
+    height, width, _ = reference_face_cv2.shape
+
+    # Initialize a list to collect face embeddings
+    face_embeddings = []
+
+    # Process each face image if multiple images are provided
+    face_image_paths = face_images_path if isinstance(face_images_path, list) else [face_images_path]
+
+    for face_path in face_image_paths:
+        face_img = load_image(face_path)
+        face_img = resize_img(face_img, max_side=1024)
+        face_img_cv2 = convert_from_image_to_cv2(face_img)
+
+        # Extract face features
+        face_info = app.get(face_img_cv2)
+
+        if len(face_info) == 0:
+            print(f"Warning: Unable to detect a face in {face_path}. Skipping this image.")
+            continue
+
+        # Use the largest face in each image
+        face_info = sorted(
+            face_info,
+            key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
+        )[-1]
+
+        # Collect the embedding
+        face_embeddings.append(torch.tensor(face_info["embedding"]).unsqueeze(0))
+
+    if len(face_embeddings) == 0:
+        raise gr.Error(
+            f"Unable to detect a face in any of the uploaded images. Please upload different photos with clear faces."
+        )
+
+    # Average the face embeddings
+    if len(face_embeddings) == 1:
+        face_emb = face_embeddings[0].squeeze().numpy()  # Use as is if only one image
+    else:
+        # Stack and compute mean along the batch dimension
+        face_emb = torch.mean(torch.cat(face_embeddings, dim=0), dim=0).numpy()
+        print(f"Averaged {len(face_embeddings)} face embeddings")
+
+    # Extract keypoints from the reference face for ControlNet
+    reference_face_info = app.get(reference_face_cv2)
+    if len(reference_face_info) == 0:
+        raise gr.Error(
+            f"Unable to detect a face in the reference image for keypoints. Please upload a different photo with a clear face."
+        )
+    reference_face_info = sorted(
+        reference_face_info,
+        key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
+    )[-1]  # Use the largest face
+
+    face_kps = draw_kps(convert_from_cv2_to_image(reference_face_cv2), reference_face_info["kps"])
+    img_controlnet = reference_face_image
+
     if pose_image_path is not None:
         pose_image = load_image(pose_image_path)
         pose_image = resize_img(pose_image, max_side=1024)
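The heart of this change is the loop above: instead of conditioning InstantID on one ArcFace embedding, it collects one embedding per uploaded photo and mean-pools them. Below is a minimal sketch of that averaging idea in isolation, assuming insightface's `FaceAnalysis` as loaded earlier in the file; the `buffalo_l` model name and the helper function itself are illustrative, not part of this commit.

```python
# Illustrative sketch (not the app's code): one averaged ArcFace embedding
# from several photos of the same person.
import cv2
import numpy as np
from insightface.app import FaceAnalysis

def average_face_embedding(image_paths):
    analyzer = FaceAnalysis(name="buffalo_l")  # assumed model pack
    analyzer.prepare(ctx_id=0, det_size=(640, 640))
    embeddings = []
    for path in image_paths:
        faces = analyzer.get(cv2.imread(path))  # insightface expects BGR arrays
        if not faces:
            continue  # mirror the app: skip photos with no detectable face
        # keep the largest detected face by bounding-box area
        face = max(faces, key=lambda f: (f.bbox[2] - f.bbox[0]) * (f.bbox[3] - f.bbox[1]))
        embeddings.append(face.embedding)
    if not embeddings:
        raise ValueError("no face detected in any input image")
    return np.mean(np.stack(embeddings), axis=0)
```

One design note: ArcFace embeddings are usually compared by cosine similarity, so a common variant L2-normalizes each embedding before pooling (and renormalizes the mean). The commit averages raw embeddings, which gives higher-norm embeddings slightly more weight.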
@@ -383,7 +353,6 @@ def generate_image(
 
     if len(controlnet_selection) > 0:
         controlnet_scales = {
-            #"pose": pose_strength,
            "canny": canny_strength,
            "depth": depth_strength,
        }
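For context on where `controlnet_scales` ends up: diffusers multi-ControlNet pipelines take one conditioning image and one scale per ControlNet, in matching order. A minimal sketch with the same two SDXL ControlNets loaded above; the base-model ID and the blank conditioning images are placeholders, and the real app routes this through its InstantID pipeline instead of the stock SDXL pipeline.

```python
# Minimal sketch of per-ControlNet conditioning scales; placeholder inputs only.
import torch
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline

controlnets = [
    ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16),
    ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0-small", torch_dtype=torch.float16),
]
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # placeholder base model
    controlnet=controlnets,
    torch_dtype=torch.float16,
).to("cuda")

canny_image = Image.new("RGB", (1024, 1024))  # stand-ins for real canny/depth maps
depth_image = Image.new("RGB", (1024, 1024))
result = pipe(
    prompt="a man",
    image=[canny_image, depth_image],          # one conditioning image per ControlNet
    controlnet_conditioning_scale=[0.3, 0.5],  # matches the new slider defaults
).images[0]
```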
@@ -425,9 +394,42 @@
 
     return images[0], gr.update(visible=True)
 
+def get_example():
+    case = [
+        [
+            "./examples/yann-lecun_resize.jpg",
+            None,
+            "a man",
+            "Spring Festival",
+            "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, (frame:1.2), deformed, ugly, deformed eyes, blur, out of focus, blurry, deformed cat, deformed, photo, anthropomorphic cat, monochrome, photo, pet collar, gun, weapon, blue, 3d, drones, drone, buildings in background, green",
+        ],
+        # Add more examples as needed
+    ]
+    return case
+
+def run_for_examples(face_file, pose_file, prompt, style, negative_prompt):
+    return generate_image(
+        face_file,
+        pose_file,
+        prompt,
+        negative_prompt,
+        style,
+        20, # num_steps
+        0.8, # identitynet_strength_ratio
+        0.8, # adapter_strength_ratio
+        0.3, # canny_strength
+        0.5, # depth_strength
+        ["depth", "canny"], # controlnet_selection
+        5.0, # guidance_scale
+        42, # seed
+        "EulerDiscreteScheduler", # scheduler
+        False, # enable_LCM
+        True, # enable_Face_Region
+    )
+
 # Description
 title = r"""
-<h1 align="center">InstantID: Zero-shot Identity-Preserving Generation in Seconds</h1>
+<h1 align="center">InstantID: Zero-shot Identity-Preserving Generation with Multi-Face Averaging</h1>
 """
 
 article = r"""
@@ -449,11 +451,12 @@ If you have any questions, please feel free to open an issue or directly reach us out at
 """
 
 tips = r"""
-### Usage tips of InstantID
-1. If you're not satisfied with the similarity, try increasing the weight of "IdentityNet Strength" and "Adapter Strength."
-2. If you feel that the saturation is too high, first decrease the Adapter strength. If it remains too high, then decrease the IdentityNet strength.
-3. If you find that text control is not as expected, decrease Adapter strength.
-4. If you find that realistic style is not good enough, go for our Github repo and use a more realistic base model.
+### Usage tips of InstantID with Multi-Face Averaging
+1. Upload multiple photos of the same person for better identity preservation through face embedding averaging.
+2. If you're not satisfied with the similarity, try increasing the weight of "IdentityNet Strength" and "Adapter Strength."
+3. If you feel that the saturation is too high, first decrease the Adapter strength. If it remains too high, then decrease the IdentityNet strength.
+4. If you find that text control is not as expected, decrease Adapter strength.
+5. If you find that realistic style is not good enough, go for our Github repo and use a more realistic base model.
 """
 
 css = """
@@ -466,10 +469,19 @@ with gr.Blocks(css=css) as demo:
         with gr.Row():
             with gr.Column():
                 with gr.Row(equal_height=True):
-                    # upload face image
-                    face_file = gr.Image(
-                        label="Upload a photo of your face", type="filepath"
+                    # Change from single image to multiple files
+                    face_files = gr.Files(
+                        label="Upload photos of your face (1 or more)",
+                        file_types=["image"]
                     )
+
+                    face_gallery = gr.Gallery(
+                        label="Your uploaded face images",
+                        visible=True,
+                        columns=5,
+                        rows=1,
+                        height=150
+                    )
 
                 # prompt
                 prompt = gr.Textbox(
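With Gradio 4.x defaults, `gr.Files` hands the callback a list of local file paths, which is what lets `generate_image` iterate over several faces while the same list doubles as a `gr.Gallery` value for preview. A stripped-down sketch of that upload-to-gallery wiring, independent of the rest of the app (component labels here are illustrative):

```python
# Standalone sketch of the multi-file upload + gallery preview pattern used above.
import gradio as gr

def show_uploads(files):
    # `files` is a list of local file paths (or None before any upload)
    return gr.update(value=files, visible=True)

with gr.Blocks() as demo:
    face_files = gr.Files(label="Upload photos of your face (1 or more)",
                          file_types=["image"], type="filepath")
    face_gallery = gr.Gallery(label="Your uploaded face images", columns=5, height=150)
    face_files.upload(fn=show_uploads, inputs=face_files, outputs=face_gallery)

if __name__ == "__main__":
    demo.launch()
```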
@@ -514,28 +526,21 @@ with gr.Blocks(css=css) as demo:
                 )
                 controlnet_selection = gr.CheckboxGroup(
                     ["canny", "depth"], label="Controlnet", value=[],
-                    info="Use pose for skeleton inference, canny for edge detection, and depth for depth map estimation"
+                    info="Use canny for edge detection, and depth for depth map estimation to control the generation process"
                 )
-                # pose_strength = gr.Slider(
-                #     label="Pose strength",
-                #     minimum=0,
-                #     maximum=1.5,
-                #     step=0.05,
-                #     value=0.40,
-                # )
                 canny_strength = gr.Slider(
                     label="Canny strength",
                     minimum=0,
                     maximum=1.5,
                     step=0.05,
-                    value=0,
+                    value=0.3,
                 )
                 depth_strength = gr.Slider(
                     label="Depth strength",
                     minimum=0,
                     maximum=1.5,
                     step=0.05,
-                    value=0,
+                    value=0.5,
                 )
                 with gr.Accordion(open=False, label="Advanced Options"):
                     negative_prompt = gr.Textbox(
@@ -586,6 +591,9 @@ with gr.Blocks(css=css) as demo:
                 label="InstantID Usage Tips", value=tips, visible=False
             )
 
+    # Connect file uploads to update the gallery
+    face_files.upload(fn=update_face_gallery, inputs=face_files, outputs=face_gallery)
+
     submit.click(
         fn=remove_tips,
         outputs=usage_tips,
@@ -598,7 +606,7 @@ with gr.Blocks(css=css) as demo:
     ).then(
         fn=generate_image,
         inputs=[
-            face_file,
+            face_files, # Changed from face_file to face_files
             pose_file,
             prompt,
             negative_prompt,
@@ -606,7 +614,6 @@ with gr.Blocks(css=css) as demo:
             num_steps,
             identitynet_strength_ratio,
             adapter_strength_ratio,
-            #pose_strength,
             canny_strength,
             depth_strength,
             controlnet_selection,
@@ -628,7 +635,7 @@ with gr.Blocks(css=css) as demo:
 
     gr.Examples(
         examples=get_example(),
-        inputs=[face_file, pose_file, prompt, style, negative_prompt],
+        inputs=[face_files, pose_file, prompt, style, negative_prompt],
         fn=run_for_examples,
         outputs=[gallery, usage_tips],
         cache_examples=True,