Maga222006 committed
Commit c3adf17 · Parent: b9ddbc7

MultiagentPersonalAssistant

agent/__pycache__/file_preprocessing.cpython-312.pyc CHANGED
Binary files a/agent/__pycache__/file_preprocessing.cpython-312.pyc and b/agent/__pycache__/file_preprocessing.cpython-312.pyc differ
 
agent/file_preprocessing.py CHANGED
@@ -1,4 +1,4 @@
-from transformers import BlipProcessor, BlipForConditionalGeneration
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from speechbrain.inference.classifiers import EncoderClassifier
 import speech_recognition as sr
 from pydub import AudioSegment
@@ -14,8 +14,16 @@ import io
 import os
 
 load_dotenv()
-processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+MID = "apple/FastVLM-1.5B"
+IMAGE_TOKEN_INDEX = -200
+
+tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    MID,
+    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto",
+    trust_remote_code=True,
+)
 language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp")
 
 
@@ -70,13 +78,42 @@ def preprocess_audio(file_name: str):
     os.remove(wav_file)
     return text
 
+
 def preprocess_image(file_name: str) -> str:
-    raw_image = Image.open(file_name).convert("RGB")
-    text = "An image of"
-    inputs = processor(raw_image, text, return_tensors="pt")
+    """Send an image + instruction to FastVLM and return the model's answer."""
+
+    # Build a chat turn containing the <image> placeholder
+    messages = [{"role": "user", "content": "<image>\nDescribe this image in detail."}]
+    rendered = tok.apply_chat_template(
+        messages, add_generation_prompt=True, tokenize=False
+    )
+    pre, post = rendered.split("<image>", 1)
+
+    # Tokenize the text on either side of the image placeholder
+    pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
+    post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
+
+    # Splice in the image token id (-200)
+    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
+    input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
+    attention_mask = torch.ones_like(input_ids, device=model.device)
+
+    # Preprocess the image with the model's own vision tower
+    img = Image.open(file_name).convert("RGB")
+    px = model.get_vision_tower().image_processor(images=img, return_tensors="pt")["pixel_values"]
+    px = px.to(model.device, dtype=model.dtype)
+
+    # Generate the response
     with torch.no_grad():
-        out = model.generate(**inputs)
-    return processor.decode(out[0], skip_special_tokens=True)
+        out = model.generate(
+            inputs=input_ids,
+            attention_mask=attention_mask,
+            images=px,
+            max_new_tokens=128,
+        )
+
+    return tok.decode(out[0], skip_special_tokens=True)
+
 
 def preprocess_text(file_name, mime_type: str) -> str:
     if "pdf" in mime_type:
requirements.txt CHANGED
@@ -13,6 +13,8 @@ langgraph-supervisor>=0.0.29
 wikipedia>=1.4.0
 wolframalpha>=5.1.3
 python-multipart
+soundfile>=0.13.1
+accelerate
 langmem>=0.0.29
 greenlet>=3.2.3
 deepagents>=0.0.3
@@ -22,10 +24,11 @@ SpeechRecognition>=3.14.3
 PyPDF2>=3.0.1
 asyncpg>=0.30.0
 pillow>=11.3.0
-langchain_huggingface>=0.3.0
+langchain-huggingface>=0.3.0
 sentence-transformers>=5.0.0
 pyowm>=3.3.0
 speechbrain>=1.0.3
+langchain-core
 geopy>=2.4.1
 langchain-community>=0.3.27
 langchain-tavily>=0.2.11
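
Of the new entries, accelerate is what transformers requires for the device_map="auto" load above, and soundfile presumably backs the audio-decoding path; both are left unpinned. A small sanity check, sketched in Python, that the updated requirements actually resolved in the current environment:

# Sanity check (sketch): confirm the newly added/renamed distributions are installed.
import importlib.metadata as md

for dist in ("soundfile", "accelerate", "langchain-huggingface", "langchain-core"):
    print(dist, md.version(dist))  # raises PackageNotFoundError if a dist is missing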