Commit 86744eb
Parent(s): c89a453

update

Changed files:
- app.py +2 -2
- models/conversation.py +33 -1
- models/mllava/__init__.py +1 -0
- models/mllava/modeling_llava.py +8 -3
- models/mllava/processing_llava.py +124 -10
- models/mllava/utils.py +99 -35
app.py
CHANGED

@@ -5,8 +5,8 @@ import time
 from PIL import Image
 from models.mllava import MLlavaProcessor, LlavaForConditionalGeneration, chat_mllava, MLlavaForConditionalGeneration
 from typing import List
-processor = MLlavaProcessor.from_pretrained("TIGER-Lab/Mantis-
-model = LlavaForConditionalGeneration.from_pretrained("TIGER-Lab/Mantis-
+processor = MLlavaProcessor.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")
+model = LlavaForConditionalGeneration.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")
 
 @spaces.GPU
 def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
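For context, a minimal sketch of how the app drives the newly referenced checkpoint through chat_mllava (the checkpoint name is the one loaded above; the local image path is a made-up placeholder and the generation kwargs are illustrative only):

from PIL import Image
from models.mllava import MLlavaProcessor, LlavaForConditionalGeneration, chat_mllava

processor = MLlavaProcessor.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")
model = LlavaForConditionalGeneration.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")

# one <image> placeholder in the prompt per image passed in
images = [Image.open("example.jpg")]  # hypothetical local file
response, history = chat_mllava("<image> What is shown in this picture?", images, model, processor, max_new_tokens=128)
print(response)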
models/conversation.py
CHANGED

@@ -10,6 +10,7 @@ class SeparatorStyle(Enum):
     MPT = auto()
     PLAIN = auto()
     LLAMA_2 = auto()
+    LLAMA_3 = auto()
     MFuyu = auto()
 

@@ -30,6 +31,7 @@ class Conversation:
     def get_prompt(self):
         messages = self.messages
         if len(messages) > 0 and type(messages[0][1]) is tuple:
+
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()

@@ -39,7 +41,6 @@ class Conversation:
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>" + init_msg)
-
        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:

@@ -89,6 +90,15 @@ class Conversation:
                else:
                    ret += ""
            ret = ret.lstrip(self.sep)
+        elif self.sep_style == SeparatorStyle.LLAMA_3:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n" + message + self.sep
+                else:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
        elif self.sep_style == SeparatorStyle.MFuyu:
            seps = [self.sep, self.sep2]
            ret = self.system + "\n"

@@ -393,6 +403,25 @@ conv_mllava_v1_mmtag = Conversation(
     version="v1_mmtag",
 )
 
+conv_mllava_v1 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="</s>",
+)
+
+conv_llama_3 = Conversation(
+    system="<|start_header_id|>system<|end_header_id|>\n\nYou are a pirate chatbot who always responds in pirate speak!",
+    roles=("user", "assistant"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_3,
+    sep="<|eot_id|>",
+)
 
 default_conversation = conv_mfuyu_v1
 conv_templates = {

@@ -409,6 +438,9 @@ conv_templates = {
     "llava_v1": conv_llava_v1,
     "v1_mmtag": conv_llava_v1_mmtag,
     "llava_llama_2": conv_llava_llama_2,
+    "llama_3": conv_llama_3,
+    "mllava_v1": conv_mllava_v1,
+    "mllava_v1_mmtag": conv_mllava_v1_mmtag,
 
     "mpt": conv_mpt,
 }
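To see what the new LLAMA_3 branch emits, here is a rough sketch that renders a one-turn prompt with the conv_llama_3 template added above (the user message is made up; the output shown in the comments is approximate):

from models.conversation import conv_templates

conv = conv_templates["llama_3"].copy()
conv.messages = []
conv.append_message("user", "Hello there!")
conv.append_message("assistant", "")  # empty assistant turn leaves an open generation header
print(conv.get_prompt())
# Approximately:
#   <|start_header_id|>system<|end_header_id|>\n\nYou are a pirate chatbot who always responds in pirate speak!<|eot_id|>
#   <|start_header_id|>user<|end_header_id|>\n\nHello there!<|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>\n\n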
models/mllava/__init__.py
CHANGED

@@ -1,3 +1,4 @@
 from .modeling_llava import LlavaForConditionalGeneration, MLlavaForConditionalGeneration
 from .processing_llava import MLlavaProcessor
+from .configuration_llava import LlavaConfig
 from .utils import chat_mllava
models/mllava/modeling_llava.py
CHANGED

@@ -249,15 +249,15 @@ LLAVA_INPUTS_DOCSTRING = r"""
     LLAVA_START_DOCSTRING,
 )
 class LlavaForConditionalGeneration(LlavaPreTrainedModel):
-    def __init__(self, config: LlavaConfig):
+    def __init__(self, config: LlavaConfig, vision_tower=None, language_model=None):
         super().__init__(config)
-        self.vision_tower = AutoModel.from_config(config.vision_config)
+        self.vision_tower = AutoModel.from_config(config.vision_config) if vision_tower is None else vision_tower
 
         self.multi_modal_projector = LlavaMultiModalProjector(config)
         self.vocab_size = config.vocab_size
         self.language_model = AutoModelForCausalLM.from_config(
             config.text_config, attn_implementation=config._attn_implementation
-        )
+        ) if language_model is None else language_model
         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
         self.post_init()
 

@@ -428,6 +428,11 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
 
         # 2. Merge text and images
         if pixel_values is not None and input_ids.shape[1] != 1:
+            if isinstance(pixel_values, list):
+                pixel_values = torch.cat([x for x in pixel_values if x is not None], dim=0)
+            # for siglip, need to transform the pixel_values to the right data type
+            if pixel_values.dtype != self.vision_tower.dtype:
+                pixel_values = pixel_values.type(self.vision_tower.dtype)
             image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
             # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated.
             selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
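The two new guards normalize the image inputs before they reach the vision tower: per-sample lists of image tensors are flattened into one batch, and pixel values are cast to the tower's dtype (relevant for half-precision SigLIP towers). A standalone illustration of that normalization, with made-up shapes and dtype:

import torch

# per-sample image tensors: sample 0 has 2 images, sample 1 has none, sample 2 has 1
pixel_values = [torch.randn(2, 3, 384, 384), None, torch.randn(1, 3, 384, 384)]

# same concatenation as the added lines in forward(): drop Nones, stack along the image axis
flat = torch.cat([x for x in pixel_values if x is not None], dim=0)

# cast to the vision tower's dtype before encoding (float16 assumed here)
vision_dtype = torch.float16
if flat.dtype != vision_dtype:
    flat = flat.type(vision_dtype)

print(flat.shape, flat.dtype)  # torch.Size([3, 3, 384, 384]) torch.float16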
models/mllava/processing_llava.py
CHANGED

@@ -16,7 +16,8 @@
 Processor class for Llava.
 """
 
-
+import os
+import json
 from typing import List, Optional, Union, Dict
 
 # from ...feature_extraction_utils import BatchFeature

@@ -30,6 +31,9 @@ from transformers.image_utils import ImageInput
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
 from transformers.utils import TensorType
+from transformers.processing_utils import transformers_module
+from transformers.utils.hub import is_remote_url, download_url, cached_file, is_offline_mode
+from transformers.utils import IMAGE_PROCESSOR_NAME
 
 from PIL import Image
 import logging

@@ -52,8 +56,8 @@ class MLlavaProcessor(ProcessorMixin):
     """
 
     attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "CLIPImageProcessor"
-    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    image_processor_class = ("CLIPImageProcessor", "SiglipImageProcessor")
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast", "PreTrainedTokenizerFast")
 
     def __init__(self, image_processor=None, tokenizer=None):
         super().__init__(image_processor, tokenizer)

@@ -109,7 +113,7 @@ class MLlavaProcessor(ProcessorMixin):
                 if i < num_images:
                     text[i] = t + "<image>"
                 text = "".join(text)
-                logger.warning("Number of <image> tokens exceeds number of images. Automatically removing extra tokens at the end of the text.")
+                logger.warning(f"Number of <image> tokens: {num_image_tokens} exceeds number of images: {num_images}. Automatically removing extra tokens at the end of the text.")
                 # raise ValueError("Invalid input text. Number of <image> tokens exceeds number of images.")
             texts = [text]
         elif isinstance(text, list):

@@ -135,7 +139,7 @@ class MLlavaProcessor(ProcessorMixin):
                     if j < num_images:
                         t[j] = s + "<image>"
                     t = "".join(t)
-                    logger.warning("Number of <image> tokens exceeds number of images. Automatically removing extra tokens at the end of the text.")
+                    logger.warning(f"Number of <image> tokens: {num_image_tokens} exceeds number of images: {num_images}. Automatically removing extra tokens at the end of the text.")
                     # raise ValueError("Invalid input text. Number of <image> tokens exceeds number of images.")
                 text[i] = t
             texts = text

@@ -171,6 +175,7 @@ class MLlavaProcessor(ProcessorMixin):
         truncation: Union[bool, str, TruncationStrategy] = None,
         max_length=None,
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+        add_image_ids: bool = True,
     ) -> BatchFeature:
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`

@@ -218,13 +223,14 @@ class MLlavaProcessor(ProcessorMixin):
               `None`).
             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
         """
-
+        if add_image_ids:
+            text, images = self.preprocess_interleaved_images_and_text(text, images)
         if images is not None:
             pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"] # [batch_size, num_channels, height, width], e.g. [1, 3, 336, 336]
         else:
             pixel_values = None
         text_inputs = self.tokenizer(
-
+            text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
         )
         # text_inputs:
         # 1. input_ids: [batch_size, sequence_length], e.g. [1, 6]

@@ -259,9 +265,117 @@ class MLlavaProcessor(ProcessorMixin):
         results = {}
         assert len(model_inputs) == 1, "This method only supports a single input, but get {} inputs".format(len(model_inputs))
         for k in model_inputs[0].keys():
-            if
-                results[k] =
+            if k == "pixel_values":
+                results[k] = [inputs[k] if inputs[k] is not None else None for inputs in model_inputs]
             else:
-                results[k] =
+                results[k] = torch.cat([inputs[k] for inputs in model_inputs], dim=0)
         return results
+
+    @classmethod
+    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        args = []
+
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", "")
+
+        from_pipeline = kwargs.pop("_from_pipeline", None)
+        from_auto_class = kwargs.pop("_from_auto", False)
+
+        user_agent = {"file_type": "processor", "from_auto_class": from_auto_class}
+        if from_pipeline is not None:
+            user_agent["using_pipeline"] = from_pipeline
+
+        if is_offline_mode() and not local_files_only:
+            logger.info("Offline mode: forcing local_files_only=True")
+            local_files_only = True
+
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+        is_local = os.path.isdir(pretrained_model_name_or_path)
+        if os.path.isdir(pretrained_model_name_or_path):
+            processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)
+        if os.path.isfile(pretrained_model_name_or_path):
+            resolved_processor_file = pretrained_model_name_or_path
+            is_local = True
+        elif is_remote_url(pretrained_model_name_or_path):
+            processor_file = pretrained_model_name_or_path
+            resolved_processor_file = download_url(pretrained_model_name_or_path)
+        else:
+            processor_file = IMAGE_PROCESSOR_NAME
+            try:
+                # Load from local folder or from cache or download from model Hub and cache
+                resolved_processor_file = cached_file(
+                    pretrained_model_name_or_path,
+                    processor_file,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    user_agent=user_agent,
+                    revision=revision,
+                    subfolder=subfolder,
+                    _raise_exceptions_for_missing_entries=True,
+                )
+            except EnvironmentError:
+                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
+                # the original exception.
+                raise
+            except Exception:
+                # For any other exception, we throw a generic error.
+                raise EnvironmentError(
+                    f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load"
+                    " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
+                    f" directory containing a {IMAGE_PROCESSOR_NAME} file"
+                )
+
+        # Existing processors on the Hub created before #27761 being merged don't have `processor_config.json` (if not
+        # updated afterward), and we need to keep `from_pretrained` work. So here it fallbacks to the empty dict.
+        # (`cached_file` called using `_raise_exceptions_for_missing_entries=False` to avoid exception)
+        # However, for models added in the future, we won't get the expected error if this file is missing.
+        if resolved_processor_file is None:
+            image_processor_dict = {}
+
+        try:
+            # Load processor dict
+            with open(resolved_processor_file, "r", encoding="utf-8") as reader:
+                text = reader.read()
+            image_processor_dict = json.loads(text)
+
+        except json.JSONDecodeError:
+            raise EnvironmentError(
+                f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file."
+            )
+
+        for attribute_name in cls.attributes:
+            class_name = getattr(cls, f"{attribute_name}_class")
+            if isinstance(class_name, tuple):
+                if attribute_name == "tokenizer":
+                    classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
+                    use_fast = kwargs.get("use_fast", True)
+                    if use_fast and classes[1] is not None:
+                        attribute_class = classes[1]
+                    else:
+                        attribute_class = classes[0]
+                elif attribute_name == "image_processor":
+                    image_processor_type = image_processor_dict.get("image_processor_type", None)
+                    if image_processor_type is not None:
+                        assert image_processor_type in class_name, f"Invalid image processor type: {image_processor_type}"
+                        attribute_class = getattr(transformers_module, image_processor_type)
+                    else:
+                        attribute_class = getattr(transformers_module, class_name[0])
+                else:
+                    raise ValueError(f"Invalid attribute name: {attribute_name}")
+            else:
+                attribute_class = getattr(transformers_module, class_name)
+
+            args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
+        return args
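With these changes the processor accepts SigLIP image processors and non-Llama tokenizers, and expands interleaved <image> placeholders itself through the new add_image_ids path. A hedged usage sketch, reusing the checkpoint name from app.py above (the image files are made-up placeholders):

from PIL import Image
from models.mllava import MLlavaProcessor

processor = MLlavaProcessor.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")

images = [Image.open("left.jpg"), Image.open("right.jpg")]  # hypothetical local files
inputs = processor(text="<image> <image> Which of the two images is brighter?", images=images, return_tensors="pt")
print(inputs["input_ids"].shape, inputs["pixel_values"].shape)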
models/mllava/utils.py
CHANGED

@@ -2,7 +2,9 @@ import PIL
 import torch
 from .modeling_llava import LlavaForConditionalGeneration
 from .processing_llava import MLlavaProcessor
-from ..conversation import conv_mllava_v1_mmtag as default_conv
+# from ..conversation import conv_mllava_v1_mmtag as default_conv
+from ..conversation import conv_mllava_v1 as default_conv, conv_templates
+
 from typing import List, Tuple, Union, Tuple
 
 def chat_mllava(

@@ -12,7 +14,6 @@ def chat_mllava(
     processor:MLlavaProcessor,
     max_input_length:int=None,
     history:List[dict]=None,
-    stream:bool=False,
     **kwargs) -> Tuple[str, List[dict]]:
     """
     Chat with the Mllava model

@@ -29,7 +30,17 @@ def chat_mllava(
 
 
     """
-
+    if "llama-3" in model.language_model.name_or_path.lower():
+        conv = conv_templates['llama_3']
+        terminators = [
+            processor.tokenizer.eos_token_id,
+            processor.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+        ]
+    else:
+        conv = default_conv
+        terminators = None
+    kwargs["eos_token_id"] = terminators
+    conv = conv.copy()
     conv.messages = []
     if history is not None:
         for message in history:

@@ -38,17 +49,8 @@ def chat_mllava(
             conv.append_message(message["role"], message["text"])
     else:
         history = []
-
-
-        conv.append_message(conv.roles[0], text)
-        conv.append_message(conv.roles[1], "")
-        history.append({"role": conv.roles[0], "text": text})
-        history.append({"role": conv.roles[1], "text": ""})
-    else:
-        assert history, "The history should not be empty if the text is None"
-        assert history[-1]['role'] == conv.roles[1], "The last message in the history should be the assistant, an empty message"
-        assert history[-2]['text'], "The last user message in the history should not be empty"
-        assert history[-1]['text'] == "", "The last assistant message in the history should be empty"
+    conv.append_message(conv.roles[0], text)
+    conv.append_message(conv.roles[1], "")
 
     prompt = conv.get_prompt()
     if images:

@@ -57,27 +59,89 @@ def chat_mllava(
             images[i] = PIL.Image.open(images[i])
 
     inputs = processor(images=images, text=prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
-
+    for k, v in inputs.items():
+        if v is not None:
+            if isinstance(v, torch.Tensor):
+                inputs[k] = v.to(model.device)
+            elif isinstance(v, list):
+                inputs[k] = [x.to(model.device) for x in v]
+            else:
+                raise ValueError(f"Invalid input type: {type(v)}")
 
-    if stream:
-        from transformers import TextIteratorStreamer
-        from threading import Thread
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        kwargs["streamer"] = streamer
-        inputs.update(kwargs)
-        thread = Thread(target=model.generate, kwargs=inputs)
-        thread.start()
-        for _output in streamer:
-            history[-1]["text"] += _output
-            yield history[-1]["text"], history
-    else:
-        output_ids = model.generate(**inputs, **kwargs)
-        output_ids = output_ids[0]
-
-        # remove the input tokens
-        generated_ids = output_ids[inputs["input_ids"].shape[-1]:]
-        generated_text = processor.decode(generated_ids, skip_special_tokens=True)
+    output_ids = model.generate(**inputs, **kwargs)
+    output_ids = output_ids[0]
+
+    # remove the input tokens
+    generated_ids = output_ids[inputs["input_ids"].shape[-1]:]
+    generated_text = processor.decode(generated_ids, skip_special_tokens=True)
 
-
+    history.append({"role": conv.roles[0], "text": text})
+    history.append({"role": conv.roles[1], "text": generated_text})
 
-
+    return generated_text, history
+
+
+def chat_mllava_stream(
+    text:str,
+    images: List[Union[PIL.Image.Image, str]],
+    model:LlavaForConditionalGeneration,
+    processor:MLlavaProcessor,
+    max_input_length:int=None,
+    history:List[dict]=None,
+    **kwargs) -> Tuple[str, List[dict]]:
+    """
+    Chat with the Mllava model
+    Args:
+        text: str, the text to be sent to the model, where <image> will be the placeholder for the image
+        images: List[PIL.Image.Image], the images to be sent to the model, or None
+        model: LlavaForConditionalGeneration, the model to be used
+        processor: MLlavaProcessor, the processor to be used
+        max_input_length: int, the maximum input length
+        history: List[dict], list of messages in the conversation as history. Each message is a dictionary {"role": "ASSISTANT/USER", "text": "the message"}. If None, the conversation will start from scratch
+        kwargs: dict, the generation kwargs
+    Returns:
+        Tuple[str, List[dict]], the generated text and the history of the conversation
+
+    """
+    conv = default_conv.copy()
+    conv.messages = []
+    if history is not None:
+        for message in history:
+            message["role"] = message["role"].upper()
+            assert message["role"] in conv.roles
+            conv.append_message(message["role"], message["text"])
+    else:
+        history = []
+    conv.append_message(conv.roles[0], text)
+    conv.append_message(conv.roles[1], "")
+
+    prompt = conv.get_prompt()
+    if images:
+        for i in range(len(images)):
+            if isinstance(images[i], str):
+                images[i] = PIL.Image.open(images[i])
+
+    inputs = processor(images=images, text=prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
+    for k, v in inputs.items():
+        if v is not None:
+            if isinstance(v, torch.Tensor):
+                inputs[k] = v.to(model.device)
+            elif isinstance(v, list):
+                inputs[k] = [x.to(model.device) for x in v]
+            else:
+                raise ValueError(f"Invalid input type: {type(v)}")
+
+    from transformers import TextIteratorStreamer
+    from threading import Thread
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    kwargs["streamer"] = streamer
+    inputs.update(kwargs)
+    thread = Thread(target=model.generate, kwargs=inputs)
+    thread.start()
+    history.append({"role": conv.roles[0], "text": text})
+    history.append({"role": conv.roles[1], "text": ""})
+    for _output in streamer:
+        history[-1]["text"] += _output
+        yield history[-1]["text"], history
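A hedged sketch of how the new streaming helper is consumed by a client such as the Gradio app (the checkpoint is the one loaded in app.py; the image path and generation kwargs are illustrative):

from PIL import Image
from models.mllava import MLlavaProcessor, LlavaForConditionalGeneration
from models.mllava.utils import chat_mllava_stream

processor = MLlavaProcessor.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")
model = LlavaForConditionalGeneration.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")

images = [Image.open("example.jpg")]  # hypothetical local file
# chat_mllava_stream is a generator: it yields the growing assistant reply plus the updated history
for partial_text, history in chat_mllava_stream("<image> Describe this image.", images, model, processor, max_new_tokens=64):
    print(partial_text, end="\r", flush=True)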