riteshkokam committed
Commit 383638b · verified · 1 Parent(s): 160ecdd

Update app.py

Files changed (1)
  1. app.py +33 -26
app.py CHANGED
@@ -1,53 +1,60 @@
  # app.py
  import gradio as gr
  import torch
- from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
  from gtts import gTTS
  import tempfile

  class AIDoctor:
-     def __init__(self, model_name="lintw/HealthGPT-M3"):
          self.device = "cpu"
-         print(f"⚙️ Using device: {self.device}")
-         self.proc = AutoProcessor.from_pretrained(model_name, local_files_only=False, trust_remote_code=True)
-         self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, local_files_only=False, trust_remote_code=True).to(self.device)
          self.stt = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)

      def analyze(self, image, question):
          if image is None:
              return "Please upload a medical image."
-         prompt = question or "What do you observe in this medical image?"
-         inputs = self.proc(text=prompt, images=image, return_tensors="pt").to(self.device)
-         outputs = self.model.generate(**inputs, max_new_tokens=200, temperature=0.7)
          return self.proc.decode(outputs[0], skip_special_tokens=True).strip()

      def tts(self, text):
          tts = gTTS(text=text, lang="en")
-         path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
-         tts.save(path)
-         return path
-
      def respond(self, image, audio, text):
-         question = text.strip()
          if audio:
-             trans_res = self.stt(audio)
-             q_trans = trans_res.get("text", "").strip() if isinstance(trans_res, dict) else str(trans_res)
-             if q_trans:
-                 question = q_trans
-         resp = self.analyze(image, question)
          voice = self.tts(resp)
-         return resp, voice, question

  doctor = AIDoctor()

- with gr.Blocks() as demo:
-     gr.Markdown("## 🏥 AI Doctor with HealthGPTM3 (CPU-optimized)")
-     img_in = gr.Image(label="Medical Image", type="pil")
-     aud_in = gr.Audio(label="Ask by voice", type="filepath")
-     txt_in = gr.Textbox(label="Ask by text")
      resp_out = gr.Textbox(label="AI Response", lines=10)
      aud_out = gr.Audio(label="AI Speaks", type="filepath")
      q_out = gr.Textbox(label="Processed Question")
      btn = gr.Button("Ask Doctor")
-     btn.click(fn=doctor.respond, inputs=[img_in, aud_in, txt_in], outputs=[resp_out, aud_out, q_out])
-     demo.launch()

  # app.py
  import gradio as gr
  import torch
+ from transformers import AutoProcessor, AutoModelForVision2Seq, pipeline
  from gtts import gTTS
  import tempfile
+ from PIL import Image

  class AIDoctor:
+     def __init__(self, vision_model="meta-llama/Llama-3.2-11B-Vision-Instruct"):
          self.device = "cpu"
+         print(f"🖥️ Using device: {self.device}")
+         self.proc = AutoProcessor.from_pretrained(vision_model, trust_remote_code=True)
+         self.model = AutoModelForVision2Seq.from_pretrained(
+             vision_model,
+             torch_dtype=torch.float32,
+             trust_remote_code=True
+         ).to(self.device)
          self.stt = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)

      def analyze(self, image, question):
          if image is None:
              return "Please upload a medical image."
+         prompt = question or "Please analyze this medical image for any abnormalities."
+         inputs = self.proc(images=image, text=prompt, return_tensors="pt").to(self.device)
+         outputs = self.model.generate(**inputs, max_new_tokens=256, temperature=0.7)
          return self.proc.decode(outputs[0], skip_special_tokens=True).strip()

      def tts(self, text):
          tts = gTTS(text=text, lang="en")
+         file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
+         tts.save(file)
+         return file
+
      def respond(self, image, audio, text):
+         q = text.strip()
          if audio:
+             result = self.stt(audio)
+             trans = result.get("text", "").strip() if isinstance(result, dict) else str(result)
+             if trans:
+                 q = trans
+         resp = self.analyze(image, q)
          voice = self.tts(resp)
+         return resp, voice, q

  doctor = AIDoctor()

+ with gr.Blocks(title="🏥 AI Doctor with Llama 3.2 Vision") as demo:
+     gr.Markdown("## AI Doctor Vision + Voice using Llama3.2-11B-Vision")
+     with gr.Row():
+         img = gr.Image(label="Medical Image", type="pil")
+         aud_input = gr.Audio(label="Ask by voice", type="filepath")
+         txt_input = gr.Textbox(label="Ask by text", lines=2)
      resp_out = gr.Textbox(label="AI Response", lines=10)
      aud_out = gr.Audio(label="AI Speaks", type="filepath")
      q_out = gr.Textbox(label="Processed Question")
      btn = gr.Button("Ask Doctor")
+     btn.click(fn=doctor.respond, inputs=[img, aud_input, txt_input],
+               outputs=[resp_out, aud_out, q_out])
+     demo.launch()
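
For reference: the Llama 3.2 Vision checkpoints are normally prompted through the processor's chat template, which inserts the <|image|> token the model expects; handing the raw question string straight to the processor, as the committed analyze() does, may error or leave the image out of the prompt. A minimal sketch of the chat-template variant, following the pattern in the Hugging Face model card (the method name analyze_with_template is illustrative, not part of this commit):

    def analyze_with_template(self, image, question):
        # Sketch only; assumes self.proc and self.model are loaded as in __init__ above.
        if image is None:
            return "Please upload a medical image."
        messages = [{
            "role": "user",
            "content": [
                {"type": "image"},  # placeholder; the actual pixels go to the processor below
                {"type": "text", "text": question or "Please analyze this medical image for any abnormalities."},
            ],
        }]
        # Render the conversation into the model's prompt format (adds the <|image|> token).
        prompt = self.proc.apply_chat_template(messages, add_generation_prompt=True)
        # add_special_tokens=False because the chat template already adds the BOS token.
        inputs = self.proc(images=image, text=prompt, add_special_tokens=False, return_tensors="pt").to(self.device)
        # do_sample=True makes temperature take effect; without it generation is greedy
        # and transformers warns that temperature is being ignored.
        outputs = self.model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
        return self.proc.decode(outputs[0], skip_special_tokens=True).strip()

Two practical caveats for a CPU Space: meta-llama/Llama-3.2-11B-Vision-Instruct is a gated repository (the Space needs an accepted license and an HF token), and 11B parameters in float32 come to roughly 44 GB of weights, more RAM than a free CPU tier provides, so a smaller or quantized vision model may be needed in practice. A quick local smoke test against the class as committed, with chest_xray.png standing in for any test image:

    from PIL import Image
    img = Image.open("chest_xray.png")  # hypothetical local test file
    answer, speech_path, question = doctor.respond(img, None, "Is there anything abnormal?")
    print(question, "->", answer, "| audio at", speech_path)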