ip-adapter plus is not working with full face #2400

Open
circuluspibo opened this issue Sep 19, 2024 · 6 comments
Assignees
avitial
Labels
PSE Escalate to PSE for further investigate, support_request

Comments

@circuluspibo

Hi. I am trying to use IP-Adapter with OpenVINO.
Two models work well (ip-adapter_sd15.bin, ip-adapter_sd15_light.bin),
but "ip-adapter-full-face_sd15.bin" does not. With some modification, the example can convert that model, but inference fails.

How can I use that model with SD 1.5?

@eaidova
Collaborator

eaidova commented Sep 19, 2024

@circuluspibo could you please provide more details about the error and about which modifications you performed?

@circuluspibo
Author

circuluspibo commented Sep 19, 2024

from pathlib import Path
from diffusers import AutoPipelineForText2Image
from transformers import CLIPVisionModelWithProjection
from diffusers.utils import load_image
from diffusers import LCMScheduler


stable_diffusion_id = "circulus/canvers-real-v3.9.1"
ip_adapter_id = "h94/IP-Adapter"
ip_adapter_weight_name = "ip-adapter-full-face_sd15.bin"  # "ip-adapter_sd15.bin" works; the failure appears when switching to this adapter
lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"
models_dir = Path("on-canvers-real-v3.9.1-ov-ip")
int8_model_path = Path("on-canvers-real-v3.9.1-ov-ip-int8")
from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig
from optimum.intel.openvino.configuration import OVQuantizationMethod

load_original_pipeline = not all(
    [
        (models_dir / model_name).exists()
        for model_name in [
            "text_encoder.xml",
            "image_encoder.xml",
            "unet.xml",
            "vae_decoder.xml",
            "vae_encoder.xml",
        ]
    ]
)


def get_pipeline_components(
    stable_diffusion_id,
    ip_adapter_id,
    ip_adapter_weight_name,
    lcm_lora_id,
    ip_adapter_scale=0.65,
):
    image_encoder = CLIPVisionModelWithProjection.from_pretrained("h94/IP-Adapter", subfolder="models/image_encoder")
    print(image_encoder)
    pipeline = AutoPipelineForText2Image.from_pretrained(stable_diffusion_id, image_encoder=image_encoder)
    pipeline.load_lora_weights(lcm_lora_id)
    pipeline.fuse_lora()
    pipeline.load_ip_adapter(ip_adapter_id, subfolder="models", weight_name=ip_adapter_weight_name)
    pipeline.set_ip_adapter_scale(ip_adapter_scale)
    scheduler = LCMScheduler.from_pretrained(stable_diffusion_id, subfolder="scheduler")
    return (
        pipeline.tokenizer,
        pipeline.feature_extractor,
        scheduler,
        pipeline.text_encoder,
        pipeline.image_encoder,
        pipeline.unet,
        pipeline.vae,
    )


if load_original_pipeline:
    (
        tokenizer,
        feature_extractor,
        scheduler,
        text_encoder,
        image_encoder,
        unet,
        vae,
    ) = get_pipeline_components(stable_diffusion_id, ip_adapter_id, ip_adapter_weight_name, lcm_lora_id)
    scheduler.save_pretrained(models_dir / "scheduler")
else:
    tokenizer, feature_extractor, scheduler, text_encoder, image_encoder, unet, vae = (
        None,
        None,
        None,
        None,
        None,
        None,
        None,
    )

import openvino as ov
import torch
import gc


def cleanup_torchscript_cache():
    """
    Helper for removing cached model representation
    """
    torch._C._jit_clear_class_registry()
    torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
    torch.jit._state._clear_class_state()

IMAGE_ENCODER_PATH = models_dir / "image_encoder.xml"
UNET_PATH = models_dir / "unet.xml"
VAE_DECODER_PATH = models_dir / "vae_decoder.xml"
VAE_ENCODER_PATH = models_dir / "vae_encoder.xml"
TEXT_ENCODER_PATH = models_dir / "text_encoder.xml"

if not IMAGE_ENCODER_PATH.exists():
    with torch.no_grad():
        ov_model = ov.convert_model(
            image_encoder,
            example_input=torch.zeros((1, 3, 224, 224)),
            input=[-1, 3, 224, 224],
        )
    ov.save_model(ov_model, IMAGE_ENCODER_PATH)
    feature_extractor.save_pretrained(models_dir / "feature_extractor")
    del ov_model
    cleanup_torchscript_cache()


if not UNET_PATH.exists():
    inputs = {
        "sample": torch.randn((2, 4, 64, 64)),
        "timestep": torch.tensor(1),
        "encoder_hidden_states": torch.randn((2, 77, 768)),
        "added_cond_kwargs": {"image_embeds": torch.ones(( 2,1280))}, #torch.ones((2, 1024) -  original emveds change for avoid failing convert
    }

    print(unet)

    with torch.no_grad():
        ov_model = ov.convert_model(unet, example_input=inputs)
    # dictionary with added_cond_kwargs will be decomposed during conversion
    # in some cases decomposition may lead to losing data type and shape information
    # We need to recover it manually after the conversion
    ov_model.inputs[-1].get_node().set_element_type(ov.Type.f32)
    ov_model.validate_nodes_and_infer_types()
    ov.save_model(ov_model, UNET_PATH)
    del ov_model
    cleanup_torchscript_cache()

if not VAE_DECODER_PATH.exists():

    class VAEDecoderWrapper(torch.nn.Module):
        def __init__(self, vae):
            super().__init__()
            self.vae = vae

        def forward(self, latents):
            return self.vae.decode(latents)

    vae_decoder = VAEDecoderWrapper(vae)
    with torch.no_grad():
        ov_model = ov.convert_model(vae_decoder, example_input=torch.ones([1, 4, 64, 64]))
    ov.save_model(ov_model, VAE_DECODER_PATH)
    del ov_model
    cleanup_torchscript_cache()
    del vae_decoder

if not VAE_ENCODER_PATH.exists():

    class VAEEncoderWrapper(torch.nn.Module):
        def __init__(self, vae):
            super().__init__()
            self.vae = vae

        def forward(self, image):
            return self.vae.encode(x=image)["latent_dist"].sample()

    vae_encoder = VAEEncoderWrapper(vae)
    vae_encoder.eval()
    image = torch.zeros((1, 3, 512, 512))
    with torch.no_grad():
        ov_model = ov.convert_model(vae_encoder, example_input=image)
    ov.save_model(ov_model, VAE_ENCODER_PATH)
    del ov_model
    cleanup_torchscript_cache()


if not TEXT_ENCODER_PATH.exists():
    with torch.no_grad():
        ov_model = ov.convert_model(
            text_encoder,
            example_input=torch.ones([1, 77], dtype=torch.long),
            input=[
                (1, 77),
            ],
        )
    ov.save_model(ov_model, TEXT_ENCODER_PATH)
    del ov_model
    cleanup_torchscript_cache()
    tokenizer.save_pretrained(models_dir / "tokenizer")

The conversion succeeds, but inference fails. The image_embeds seem to differ between the original ip-adapter and the ip-adapter-plus series (1024 vs. 1280).
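
As a side note, a minimal sketch of where the shape mismatch likely comes from (assuming the stock h94/IP-Adapter image encoder, CLIP ViT-H/14): the plain adapters consume the pooled 1024-dim projection, while the plus/full-face adapters are trained on the encoder's penultimate hidden states (257 tokens of width 1280), so a flat (2, 1280) placeholder matches neither layout.

import torch
from transformers import CLIPVisionModelWithProjection

encoder = CLIPVisionModelWithProjection.from_pretrained(
    "h94/IP-Adapter", subfolder="models/image_encoder"
)

with torch.no_grad():
    out = encoder(torch.zeros((1, 3, 224, 224)), output_hidden_states=True)

# ip-adapter_sd15 / _light consume the pooled, projected embedding:
print(out.image_embeds.shape)       # torch.Size([1, 1024])

# The plus/full-face variants consume the penultimate hidden states,
# which their resampler then compresses into adapter tokens:
print(out.hidden_states[-2].shape)  # torch.Size([1, 257, 1280])

# So the example image_embeds for the full-face UNet export would
# presumably need shape (2, 257, 1280), not (2, 1280) or (2, 1024).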

@Iffa-Intel

Hi @circuluspibo,

Could you clarify:

  1. Which OpenVINO Notebook demo are you using?
  2. Is the model custom/modified or pre-trained?
  3. If custom/modified, could you share it with us for validation purposes?

@circuluspibo
Author

@Iffa-Intel

Iffa-Intel commented Oct 4, 2024

@circuluspibo do you have the .xml file? I notice there are only .bin files in that link.
Also, which OpenVINO Notebook demo code did you use to infer the model, or are you using custom inference code?

@Iffa-Intel Iffa-Intel added the PSE Escalate to PSE for further investigate label Oct 8, 2024
@avitial avitial self-assigned this Oct 10, 2024