Fix Flash Attention 2 import error with conditional loading
- Add is_flash_attention_available() helper function to detect flash-attn package
- Update both load_base_model() and load_chat_model() to conditionally use Flash Attention 2
- Fall back to default attention implementation if flash-attn is not installed
- Resolves ImportError in ZeroGPU environments without flash-attn dependency
This ensures the app works in all environments regardless of Flash Attention availability.
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
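
For context, the error this commit addresses comes from requesting Flash Attention 2 unconditionally: `from_pretrained` raises an ImportError at load time when `attn_implementation="flash_attention_2"` is passed but the `flash-attn` package is not installed, which is the situation in ZeroGPU environments. A minimal sketch of the kind of call that fails (mirroring the pre-commit loader shown in the diff below):

```python
import torch
from transformers import Kosmos2_5ForConditionalGeneration

# Before this commit: Flash Attention 2 was requested unconditionally.
# Without the flash-attn package installed, this raises an ImportError
# instead of falling back to the default attention backend.
model = Kosmos2_5ForConditionalGeneration.from_pretrained(
    "microsoft/kosmos-2.5",
    device_map="cuda",
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # hard requirement -> ImportError
)
```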
app.py CHANGED

```diff
@@ -9,6 +9,13 @@ import re
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 
+# Check if Flash Attention 2 is available
+def is_flash_attention_available():
+    try:
+        import flash_attn
+        return True
+    except ImportError:
+        return False
 
 # Initialize models and processors lazily
 base_model = None
@@ -20,11 +27,18 @@ def load_base_model():
     global base_model, base_processor
     if base_model is None:
         base_repo = "microsoft/kosmos-2.5"
+
+        # Use Flash Attention 2 if available, otherwise use default attention
+        model_kwargs = {
+            "device_map": "cuda",
+            "dtype": dtype,
+        }
+        if is_flash_attention_available():
+            model_kwargs["attn_implementation"] = "flash_attention_2"
+
         base_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
             base_repo,
-            device_map="cuda",
-            dtype=dtype,
-            attn_implementation="flash_attention_2"
+            **model_kwargs
         )
         base_processor = AutoProcessor.from_pretrained(base_repo)
     return base_model, base_processor
@@ -33,11 +47,18 @@ def load_chat_model():
     global chat_model, chat_processor
     if chat_model is None:
         chat_repo = "microsoft/kosmos-2.5-chat"
+
+        # Use Flash Attention 2 if available, otherwise use default attention
+        model_kwargs = {
+            "device_map": "cuda",
+            "dtype": dtype,
+        }
+        if is_flash_attention_available():
+            model_kwargs["attn_implementation"] = "flash_attention_2"
+
         chat_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
             chat_repo,
-            device_map="cuda",
-            dtype=dtype,
-            attn_implementation="flash_attention_2"
+            **model_kwargs
         )
         chat_processor = AutoProcessor.from_pretrained(chat_repo)
     return chat_model, chat_processor
```
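
After the change, a quick way to confirm which attention backend was actually selected is to inspect the loaded model's config. This is a debugging sketch, not part of the commit, and `_attn_implementation` is an internal transformers attribute, so treat it as a diagnostic aid only:

```python
# Sketch: check whether Flash Attention 2 or the fallback path was used.
# Assumes the updated app.py above is importable (load_base_model,
# is_flash_attention_available).
model, processor = load_base_model()
print("flash-attn installed:", is_flash_attention_available())
print("attention implementation:", model.config._attn_implementation)
# -> "flash_attention_2" when flash-attn is available, otherwise the
#    transformers default (typically "sdpa" or "eager").
```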