Spaces:
Running
on
Zero
Running
on
Zero
Vladyslav Humennyy
committed on
Commit
·
edb7715
1
Parent(s):
b7b5970
Fix vision processing by using PIL Images instead of file paths
Browse files
Previously, the processor was receiving file paths instead of actual PIL Image objects, causing vision functionality to fail. This commit updates the image handling pipeline:
- Changed _ensure_image_object() to return PIL Images instead of paths
- Updated user() to store PIL Images with proper multi-modal format
- Added _prepare_processor_history() to format messages correctly for the processor
- Updated helper functions to handle new image format
app.py
CHANGED
|
@@ -53,25 +53,23 @@ def load_model():
|
|
| 53 |
model, tokenizer, processor, device = load_model()
|
| 54 |
|
| 55 |
|
| 56 |
-
def
|
| 57 |
-
"""Return a
|
| 58 |
if image_data is None:
|
| 59 |
return None
|
| 60 |
|
| 61 |
-
if isinstance(image_data, str) and os.path.exists(image_data):
|
| 62 |
-
return image_data
|
| 63 |
-
|
| 64 |
try:
|
| 65 |
from PIL import Image
|
| 66 |
except ImportError: # pragma: no cover - PIL is bundled with Gradio's image component
|
| 67 |
return None
|
| 68 |
|
|
|
|
| 69 |
if isinstance(image_data, Image.Image):
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
return
|
| 75 |
|
| 76 |
return None
|
| 77 |
|
|
@@ -83,13 +81,26 @@ def user(user_message, image_data, history: list):
|
|
| 83 |
has_content = False
|
| 84 |
|
| 85 |
stripped_message = user_message.strip()
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
updated_history.append({"role": "user", "content": stripped_message})
|
| 88 |
has_content = True
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
| 93 |
has_content = True
|
| 94 |
|
| 95 |
if not has_content:
|
|
@@ -112,7 +123,7 @@ def append_example_message(x: gr.SelectData, history):
|
|
| 112 |
def _message_contains_image(message: dict[str, Any]) -> bool:
|
| 113 |
content = message.get("content")
|
| 114 |
if isinstance(content, dict):
|
| 115 |
-
if "path" in content:
|
| 116 |
return True
|
| 117 |
if content.get("type") in {"image", "image_url"}:
|
| 118 |
return True
|
|
@@ -131,6 +142,8 @@ def _content_to_text(content: Any) -> str:
|
|
| 131 |
alt_text = content.get("alt_text")
|
| 132 |
placeholder = alt_text or os.path.basename(content["path"]) or "image"
|
| 133 |
return f"[image: {placeholder}]"
|
|
|
|
|
|
|
| 134 |
if content.get("type") == "image_url":
|
| 135 |
image_url = content.get("image_url")
|
| 136 |
if isinstance(image_url, dict):
|
|
@@ -147,8 +160,7 @@ def _content_to_text(content: Any) -> str:
|
|
| 147 |
if item_type == "text":
|
| 148 |
text_parts.append(item.get("text", ""))
|
| 149 |
elif item_type == "image":
|
| 150 |
-
|
| 151 |
-
text_parts.append(f"[image: {alt_text}]" if alt_text else "[image]")
|
| 152 |
elif item_type == "image_url":
|
| 153 |
image_url = item.get("image_url")
|
| 154 |
if isinstance(image_url, dict):
|
|
@@ -188,6 +200,47 @@ def _prepare_text_history(history: list[dict[str, Any]]) -> list[dict[str, str]]
|
|
| 188 |
return text_history
|
| 189 |
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
@spaces.GPU
|
| 192 |
def bot(
|
| 193 |
history: list[dict[str, Any]]
|
|
@@ -223,8 +276,9 @@ def bot(
|
|
| 223 |
|
| 224 |
if processor is not None and any(_message_contains_image(msg) for msg in history):
|
| 225 |
try:
|
|
|
|
| 226 |
model_inputs = processor(
|
| 227 |
-
messages=
|
| 228 |
return_tensors="pt",
|
| 229 |
add_generation_prompt=True,
|
| 230 |
).to(model.device)
|
|
|
|
| 53 |
model, tokenizer, processor, device = load_model()
|
| 54 |
|
| 55 |
|
| 56 |
+
def _ensure_image_object(image_data: Any) -> Any | None:
|
| 57 |
+
"""Return a PIL Image object for the provided image data."""
|
| 58 |
if image_data is None:
|
| 59 |
return None
|
| 60 |
|
|
|
|
|
|
|
|
|
|
| 61 |
try:
|
| 62 |
from PIL import Image
|
| 63 |
except ImportError: # pragma: no cover - PIL is bundled with Gradio's image component
|
| 64 |
return None
|
| 65 |
|
| 66 |
+
# Already a PIL Image
|
| 67 |
if isinstance(image_data, Image.Image):
|
| 68 |
+
return image_data
|
| 69 |
+
|
| 70 |
+
# Load from path
|
| 71 |
+
if isinstance(image_data, str) and os.path.exists(image_data):
|
| 72 |
+
return Image.open(image_data)
|
| 73 |
|
| 74 |
return None
|
| 75 |
|
|
|
|
| 81 |
has_content = False
|
| 82 |
|
| 83 |
stripped_message = user_message.strip()
|
| 84 |
+
image_obj = _ensure_image_object(image_data)
|
| 85 |
+
|
| 86 |
+
# If we have both text and image, combine them in a single message
|
| 87 |
+
if stripped_message and image_obj is not None:
|
| 88 |
+
updated_history.append({
|
| 89 |
+
"role": "user",
|
| 90 |
+
"content": [
|
| 91 |
+
{"type": "text", "text": stripped_message},
|
| 92 |
+
{"type": "image", "image": image_obj}
|
| 93 |
+
]
|
| 94 |
+
})
|
| 95 |
+
has_content = True
|
| 96 |
+
elif stripped_message:
|
| 97 |
updated_history.append({"role": "user", "content": stripped_message})
|
| 98 |
has_content = True
|
| 99 |
+
elif image_obj is not None:
|
| 100 |
+
updated_history.append({
|
| 101 |
+
"role": "user",
|
| 102 |
+
"content": [{"type": "image", "image": image_obj}]
|
| 103 |
+
})
|
| 104 |
has_content = True
|
| 105 |
|
| 106 |
if not has_content:
|
|
|
|
| 123 |
def _message_contains_image(message: dict[str, Any]) -> bool:
|
| 124 |
content = message.get("content")
|
| 125 |
if isinstance(content, dict):
|
| 126 |
+
if "path" in content or "image" in content:
|
| 127 |
return True
|
| 128 |
if content.get("type") in {"image", "image_url"}:
|
| 129 |
return True
|
|
|
|
| 142 |
alt_text = content.get("alt_text")
|
| 143 |
placeholder = alt_text or os.path.basename(content["path"]) or "image"
|
| 144 |
return f"[image: {placeholder}]"
|
| 145 |
+
if "image" in content:
|
| 146 |
+
return "[image]"
|
| 147 |
if content.get("type") == "image_url":
|
| 148 |
image_url = content.get("image_url")
|
| 149 |
if isinstance(image_url, dict):
|
|
|
|
| 160 |
if item_type == "text":
|
| 161 |
text_parts.append(item.get("text", ""))
|
| 162 |
elif item_type == "image":
|
| 163 |
+
text_parts.append("[image]")
|
|
|
|
| 164 |
elif item_type == "image_url":
|
| 165 |
image_url = item.get("image_url")
|
| 166 |
if isinstance(image_url, dict):
|
|
|
|
| 200 |
return text_history
|
| 201 |
|
| 202 |
|
| 203 |
+
def _prepare_processor_history(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
| 204 |
+
"""Prepare history for processor with proper image format."""
|
| 205 |
+
processor_history = []
|
| 206 |
+
|
| 207 |
+
for message in history:
|
| 208 |
+
role = message.get("role", "user")
|
| 209 |
+
content = message.get("content")
|
| 210 |
+
|
| 211 |
+
# Handle different content formats
|
| 212 |
+
if isinstance(content, str):
|
| 213 |
+
# Simple text message
|
| 214 |
+
processor_history.append({"role": role, "content": content})
|
| 215 |
+
elif isinstance(content, list):
|
| 216 |
+
# Multi-modal content (text + images)
|
| 217 |
+
formatted_content = []
|
| 218 |
+
for item in content:
|
| 219 |
+
if isinstance(item, dict):
|
| 220 |
+
item_type = item.get("type")
|
| 221 |
+
if item_type == "text":
|
| 222 |
+
formatted_content.append({"type": "text", "text": item.get("text", "")})
|
| 223 |
+
elif item_type == "image":
|
| 224 |
+
# Include the PIL Image directly
|
| 225 |
+
formatted_content.append({"type": "image", "image": item.get("image")})
|
| 226 |
+
if formatted_content:
|
| 227 |
+
processor_history.append({"role": role, "content": formatted_content})
|
| 228 |
+
elif isinstance(content, dict):
|
| 229 |
+
# Legacy format or single image
|
| 230 |
+
if "image" in content:
|
| 231 |
+
processor_history.append({
|
| 232 |
+
"role": role,
|
| 233 |
+
"content": [{"type": "image", "image": content["image"]}]
|
| 234 |
+
})
|
| 235 |
+
else:
|
| 236 |
+
# Try to extract text
|
| 237 |
+
text = _content_to_text(content)
|
| 238 |
+
if text:
|
| 239 |
+
processor_history.append({"role": role, "content": text})
|
| 240 |
+
|
| 241 |
+
return processor_history
|
| 242 |
+
|
| 243 |
+
|
| 244 |
@spaces.GPU
|
| 245 |
def bot(
|
| 246 |
history: list[dict[str, Any]]
|
|
|
|
| 276 |
|
| 277 |
if processor is not None and any(_message_contains_image(msg) for msg in history):
|
| 278 |
try:
|
| 279 |
+
processor_history = _prepare_processor_history(history)
|
| 280 |
model_inputs = processor(
|
| 281 |
+
messages=processor_history,
|
| 282 |
return_tensors="pt",
|
| 283 |
add_generation_prompt=True,
|
| 284 |
).to(model.device)
|