riteshkokam committed
Commit ab6adaa · verified · 1 Parent(s): bf7dabd

Update app.py

Files changed (1)
  1. app.py +52 -33
app.py CHANGED
@@ -1,60 +1,79 @@
-# app.py
 import gradio as gr
 import torch
-from transformers import AutoProcessor, AutoModelForVision2Seq, pipeline
 from gtts import gTTS
 import tempfile
 from PIL import Image
 
 class AIDoctor:
-    def __init__(self, vision_model="meta-llama/Llama-3.2-11B-Vision"):
         self.device = "cpu"
-        print(f"🖥️ Using device: {self.device}")
-        self.proc = AutoProcessor.from_pretrained(vision_model, trust_remote_code=True)
-        self.model = AutoModelForVision2Seq.from_pretrained(
-            vision_model,
             torch_dtype=torch.float32,
             trust_remote_code=True
         ).to(self.device)
         self.stt = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
-
     def analyze(self, image, question):
         if image is None:
-            return "Please upload a medical image."
-        prompt = question or "Please analyze this medical image for any abnormalities."
-        inputs = self.proc(images=image, text=prompt, return_tensors="pt").to(self.device)
-        outputs = self.model.generate(**inputs, max_new_tokens=256, temperature=0.7)
-        return self.proc.decode(outputs[0], skip_special_tokens=True).strip()
-
     def tts(self, text):
         tts = gTTS(text=text, lang="en")
-        file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
-        tts.save(file)
-        return file
-
     def respond(self, image, audio, text):
-        q = text.strip()
         if audio:
-            result = self.stt(audio)
-            trans = result.get("text", "").strip() if isinstance(result, dict) else str(result)
-            if trans:
-                q = trans
-        resp = self.analyze(image, q)
         voice = self.tts(resp)
-        return resp, voice, q
 
 doctor = AIDoctor()
 
-with gr.Blocks(title="🏥 AI Doctor with Llama 3.2 Vision") as demo:
-    gr.Markdown("## AI Doctor - Vision + Voice using Llama-3.2-11B-Vision")
     with gr.Row():
-        img = gr.Image(label="Medical Image", type="pil")
-        aud_input = gr.Audio(label="Ask by voice", type="filepath")
-        txt_input = gr.Textbox(label="Ask by text", lines=2)
     resp_out = gr.Textbox(label="AI Response", lines=10)
     aud_out = gr.Audio(label="AI Speaks", type="filepath")
-    q_out = gr.Textbox(label="Processed Question")
-    btn = gr.Button("Ask Doctor")
-    btn.click(fn=doctor.respond, inputs=[img, aud_input, txt_input],
              outputs=[resp_out, aud_out, q_out])
 demo.launch()
 
+import os
 import gradio as gr
 import torch
+from transformers import AutoProcessor, MllamaForConditionalGeneration, pipeline
+from huggingface_hub import login
 from gtts import gTTS
 import tempfile
 from PIL import Image
 
+# ----------------
+# 💡 STEP 0: AUTHENTICATE WITH HF
+login(token=os.getenv("HUGGINGFACE_TOKEN"))  # Or paste your token: "hf_xxx"
+
+# ----------------
 class AIDoctor:
+    def __init__(self, model_id="meta-llama/Llama-3.2-11B-Vision-Instruct"):
         self.device = "cpu"
+        print(f"🔧 Running on device: {self.device}")
+
+        # Load vision+language model with gated access
+        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+        self.model = MllamaForConditionalGeneration.from_pretrained(
+            model_id,
             torch_dtype=torch.float32,
             trust_remote_code=True
         ).to(self.device)
+
+        # Speech-to-text
         self.stt = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
+
     def analyze(self, image, question):
         if image is None:
+            return "❗ Please upload a medical image."
+        prompt = question.strip() or "Analyze this medical image and share any abnormalities."
+
+        inputs = self.processor(
+            images=image, text=prompt, return_tensors="pt"
+        ).to(self.device)
+        outputs = self.model.generate(**inputs, max_new_tokens=200, temperature=0.7)
+        return self.processor.decode(outputs[0], skip_special_tokens=True).strip()
+
     def tts(self, text):
         tts = gTTS(text=text, lang="en")
+        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
+        tts.save(tmp)
+        return tmp
+
     def respond(self, image, audio, text):
+        question = text.strip()
         if audio:
+            res = self.stt(audio)
+            q = res.get("text", "").strip() if isinstance(res, dict) else str(res).strip()
+            if q:
+                question = q
+        resp = self.analyze(image, question)
         voice = self.tts(resp)
+        return resp, voice, question
 
+# Initialize model
 doctor = AIDoctor()
 
+# ----------------
+with gr.Blocks(title="🏥 AI Doctor with Llama 3.2 Vision") as demo:
+    gr.Markdown("## AI Doctor - Vision + Voice powered by Llama 3.2-Vision-Instruct")
+
     with gr.Row():
+        img_in = gr.Image(label="Upload Medical Image", type="pil")
+        aud_in = gr.Audio(label="Ask by Voice", type="filepath")
+        txt_in = gr.Textbox(label="Ask by Text", lines=2)
+
     resp_out = gr.Textbox(label="AI Response", lines=10)
     aud_out = gr.Audio(label="AI Speaks", type="filepath")
+    q_out = gr.Textbox(label="Processed Question", lines=1)
+
+    btn = gr.Button("Ask AI Doctor")
+    btn.click(fn=doctor.respond,
+              inputs=[img_in, aud_in, txt_in],
              outputs=[resp_out, aud_out, q_out])
 demo.launch()
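
A note on the new gated-model login: login(token=os.getenv("HUGGINGFACE_TOKEN")) passes None when the secret is not set, and the failure then only surfaces later when the gated Llama 3.2 weights are fetched. Below is a minimal sketch of a stricter variant, assuming the same HUGGINGFACE_TOKEN name (for example, set as a Space secret); it is not part of this commit.

# Sketch only (not part of the commit): fail fast if the token secret is missing.
# HUGGINGFACE_TOKEN mirrors the variable name used in app.py above.
import os
from huggingface_hub import login

token = os.getenv("HUGGINGFACE_TOKEN")
if not token:
    raise RuntimeError("Set HUGGINGFACE_TOKEN to download the gated Llama 3.2 checkpoint.")
login(token=token)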
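
Separately, analyze() passes the raw question string straight to the processor, while meta-llama/Llama-3.2-11B-Vision-Instruct is tuned for a chat-style prompt in which the processor inserts an <|image|> placeholder; without it, generation may fail or give weak answers. A possible rewrite of analyze() using apply_chat_template, offered only as a sketch that mirrors the names in the class above:

# Sketch only: build the prompt with the processor's chat template so the
# <|image|> token expected by Llama-3.2-Vision-Instruct is inserted automatically.
def analyze(self, image, question):
    if image is None:
        return "❗ Please upload a medical image."
    user_text = (question or "").strip() or "Analyze this medical image and share any abnormalities."
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": user_text},
        ],
    }]
    prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = self.processor(image, prompt, add_special_tokens=False, return_tensors="pt").to(self.device)
    outputs = self.model.generate(**inputs, max_new_tokens=200)
    return self.processor.decode(outputs[0], skip_special_tokens=True).strip()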