Spaces:

nielsr
/

kosmos-2.5-demo

Running on Zero

nielsr HF Staff Claude committed on Aug 28

Commit

d463280

1 Parent(s): c25975a

Fix ZeroGPU and model loading issues

- Add accelerate dependency to requirements
- Replace deprecated torch_dtype with dtype parameter
- Implement lazy model loading to avoid ZeroGPU context issues
- Load models only when needed inside @spaces.GPU decorated functions

Fixes:
- ValueError: Using a device_map requires accelerate
- torch_dtype deprecation warnings
- ZeroGPU function called outside Gradio context warnings

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show

app.py +45 -33
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -10,31 +10,37 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-# Initialize models and processors
-@spaces.GPU
-def load_models():
-    base_repo = "microsoft/kosmos-2.5"
-    chat_repo = "microsoft/kosmos-2.5-chat"
-    base_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
-        base_repo,
-        device_map=device,
-        torch_dtype=dtype,
-        attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
-    )
-    base_processor = AutoProcessor.from_pretrained(base_repo)
-    chat_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
-        chat_repo,
-        device_map=device,
-        torch_dtype=dtype,
-        attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
-    )
-    chat_processor = AutoProcessor.from_pretrained(chat_repo)
-    return base_model, base_processor, chat_model, chat_processor
-base_model, base_processor, chat_model, chat_processor = load_models()
 def post_process_ocr(y, scale_height, scale_width, prompt="<ocr>"):
     y = y.replace(prompt, "")
@@ -65,8 +71,10 @@ def generate_markdown(image):
     if image is None:
         return "Please upload an image."
     prompt = "<md>"
-    inputs = base_processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
@@ -77,12 +85,12 @@ def generate_markdown(image):
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
-        generated_ids = base_model.generate(
             **inputs,
             max_new_tokens=1024,
         )
-    generated_text = base_processor.batch_decode(generated_ids, skip_special_tokens=True)
     result = generated_text[0].replace(prompt, "").strip()
     return result
@@ -92,8 +100,10 @@ def generate_ocr(image):
     if image is None:
         return "Please upload an image.", None
     prompt = "<ocr>"
-    inputs = base_processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
@@ -104,12 +114,12 @@ def generate_ocr(image):
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
-        generated_ids = base_model.generate(
             **inputs,
             max_new_tokens=1024,
         )
-    generated_text = base_processor.batch_decode(generated_ids, skip_special_tokens=True)
     # Post-process OCR output
     output_text = post_process_ocr(generated_text[0], scale_height, scale_width)
@@ -140,10 +150,12 @@ def generate_chat_response(image, question):
     if not question.strip():
         return "Please ask a question."
     template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
     prompt = template.format(question)
-    inputs = chat_processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
@@ -154,12 +166,12 @@ def generate_chat_response(image, question):
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
-        generated_ids = chat_model.generate(
             **inputs,
             max_new_tokens=1024,
         )
-    generated_text = chat_processor.batch_decode(generated_ids, skip_special_tokens=True)
     # Extract only the assistant's response
     result = generated_text[0]

 dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+# Initialize models and processors lazily
+base_model = None
+base_processor = None
+chat_model = None
+chat_processor = None
+def load_base_model():
+    global base_model, base_processor
+    if base_model is None:
+        base_repo = "microsoft/kosmos-2.5"
+        base_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
+            base_repo,
+            device_map=device,
+            dtype=dtype,
+            attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
+        )
+        base_processor = AutoProcessor.from_pretrained(base_repo)
+    return base_model, base_processor
+def load_chat_model():
+    global chat_model, chat_processor
+    if chat_model is None:
+        chat_repo = "microsoft/kosmos-2.5-chat"
+        chat_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
+            chat_repo,
+            device_map=device,
+            dtype=dtype,
+            attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
+        )
+        chat_processor = AutoProcessor.from_pretrained(chat_repo)
+    return chat_model, chat_processor
 def post_process_ocr(y, scale_height, scale_width, prompt="<ocr>"):
     y = y.replace(prompt, "")
     if image is None:
         return "Please upload an image."
+    model, processor = load_base_model()
     prompt = "<md>"
+    inputs = processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
+        generated_ids = model.generate(
             **inputs,
             max_new_tokens=1024,
         )
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
     result = generated_text[0].replace(prompt, "").strip()
     return result
     if image is None:
         return "Please upload an image.", None
+    model, processor = load_base_model()
     prompt = "<ocr>"
+    inputs = processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
+        generated_ids = model.generate(
             **inputs,
             max_new_tokens=1024,
         )
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
     # Post-process OCR output
     output_text = post_process_ocr(generated_text[0], scale_height, scale_width)
     if not question.strip():
         return "Please ask a question."
+    model, processor = load_chat_model()
     template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
     prompt = template.format(question)
+    inputs = processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
+        generated_ids = model.generate(
             **inputs,
             max_new_tokens=1024,
         )
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
     # Extract only the assistant's response
     result = generated_text[0]

requirements.txt CHANGED Viewed

@@ -1,6 +1,7 @@
 gradio==4.44.0
 torch>=2.0.0
 git+https://github.com/huggingface/transformers.git
 pillow
 requests
 spaces

 gradio==4.44.0
 torch>=2.0.0
 git+https://github.com/huggingface/transformers.git
+accelerate
 pillow
 requests
 spaces