Maga222006 committed
Commit c3adf17 · Parent: b9ddbc7

MultiagentPersonalAssistant

agent/__pycache__/file_preprocessing.cpython-312.pyc CHANGED
Binary files a/agent/__pycache__/file_preprocessing.cpython-312.pyc and b/agent/__pycache__/file_preprocessing.cpython-312.pyc differ
 
agent/file_preprocessing.py CHANGED
@@ -1,4 +1,4 @@
-from transformers import BlipProcessor, BlipForConditionalGeneration
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from speechbrain.inference.classifiers import EncoderClassifier
 import speech_recognition as sr
 from pydub import AudioSegment
@@ -14,8 +14,16 @@ import io
 import os
 
 load_dotenv()
-processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+MID = "apple/FastVLM-1.5B"
+IMAGE_TOKEN_INDEX = -200
+
+tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    MID,
+    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto",
+    trust_remote_code=True,
+)
 language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp")
 
 
@@ -70,13 +78,42 @@ def preprocess_audio(file_name: str):
     os.remove(wav_file)
     return text
 
+
 def preprocess_image(file_name: str) -> str:
-    raw_image = Image.open(file_name).convert("RGB")
-    text = "An image of"
-    inputs = processor(raw_image, text, return_tensors="pt")
+    """Send an image + instruction to FastVLM and return the model's answer."""
+
+    # Build a chat turn containing the <image> placeholder
+    messages = [{"role": "user", "content": "<image>\nDescribe this image in detail."}]
+    rendered = tok.apply_chat_template(
+        messages, add_generation_prompt=True, tokenize=False
+    )
+    pre, post = rendered.split("<image>", 1)
+
+    # Tokenize the text on either side of the image placeholder
+    pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
+    post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
+
+    # Splice in the image token id (-200)
+    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
+    input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
+    attention_mask = torch.ones_like(input_ids, device=model.device)
+
+    # Preprocess the image with the model's own vision tower
+    img = Image.open(file_name).convert("RGB")
+    px = model.get_vision_tower().image_processor(images=img, return_tensors="pt")["pixel_values"]
+    px = px.to(model.device, dtype=model.dtype)
+
+    # Generate the response
     with torch.no_grad():
-        out = model.generate(**inputs)
-    return processor.decode(out[0], skip_special_tokens=True)
+        out = model.generate(
+            inputs=input_ids,
+            attention_mask=attention_mask,
+            images=px,
+            max_new_tokens=128,
+        )
+
+    return tok.decode(out[0], skip_special_tokens=True)
+
 
 def preprocess_text(file_name, mime_type: str) -> str:
     if "pdf" in mime_type:
requirements.txt CHANGED
@@ -13,6 +13,8 @@ langgraph-supervisor>=0.0.29
 wikipedia>=1.4.0
 wolframalpha>=5.1.3
 python-multipart
+soundfile>=0.13.1
+accelerate
 langmem>=0.0.29
 greenlet>=3.2.3
 deepagents>=0.0.3
@@ -22,10 +24,11 @@ SpeechRecognition>=3.14.3
 PyPDF2>=3.0.1
 asyncpg>=0.30.0
 pillow>=11.3.0
-langchain_huggingface>=0.3.0
+langchain-huggingface>=0.3.0
 sentence-transformers>=5.0.0
 pyowm>=3.3.0
 speechbrain>=1.0.3
+langchain-core
 geopy>=2.4.1
 langchain-community>=0.3.27
 langchain-tavily>=0.2.11
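
Of the new entries, accelerate is what transformers requires for the device_map="auto" load above, and soundfile presumably backs the audio-decoding path; both are left unpinned. A small sanity check, sketched in Python, that the updated requirements actually resolved in the current environment:

# Sanity check (sketch): confirm the newly added/renamed distributions are installed.
import importlib.metadata as md

for dist in ("soundfile", "accelerate", "langchain-huggingface", "langchain-core"):
    print(dist, md.version(dist))  # raises PackageNotFoundError if a dist is missing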