from fastapi import FastAPI
from fastapi.responses import RedirectResponse, JSONResponse, FileResponse
import os
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering, pipeline
from gtts import gTTS
import easyocr
import torch
import tempfile
import numpy as np
from io import BytesIO

app = FastAPI()

# Load the models once at startup: ViLT for visual question answering,
# a ViT-GPT2 pipeline for captioning, and EasyOCR (English + French) for
# reading text out of images.
vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
reader = easyocr.Reader(['en', 'fr'])
def classify_question(question: str):
    q = question.lower()
    if any(w in q for w in ["text", "say", "written", "read"]):
        return "ocr"
    if any(w in q for w in ["caption", "describe", "what is in the image"]):
        return "caption"
    return "vqa"
def answer_question_from_image(image, question):
    if image is None or not question.strip():
        return "Please upload an image and ask a question.", None

    mode = classify_question(question)
    try:
        if mode == "ocr":
            # EasyOCR expects a numpy array; join all detected text fragments.
            result = reader.readtext(np.array(image))
            answer = " ".join([entry[1] for entry in result]) or "No readable text found."
        elif mode == "caption":
            answer = captioner(image)[0]['generated_text']
        else:
            inputs = vqa_processor(image, question, return_tensors="pt")
            with torch.no_grad():
                outputs = vqa_model(**inputs)
            predicted_id = outputs.logits.argmax(-1).item()
            answer = vqa_model.config.id2label[predicted_id]

        # Synthesize the answer as speech; delete=False keeps the MP3 on disk
        # after the handle closes so the path can be returned to the caller.
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tts.save(tmp.name)
        return answer, tmp.name
    except Exception as e:
        return f"Error: {e}", None
@app.get("/")
def home():
    # Without a route decorator this function is never served, so one is
    # restored here. The redirect assumes the templates directory is exposed
    # elsewhere (e.g., via a StaticFiles mount); otherwise it will 404.
    return RedirectResponse(url="/templates/home.html")
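
# --- Hypothetical usage sketch, not part of the original file ---
# `app` and `answer_question_from_image` are defined above but never wired
# together; a minimal endpoint doing so might look like this. The route path
# "/ask" and the multipart field names are assumptions for illustration.
from fastapi import File, Form, UploadFile

@app.post("/ask")
async def ask(image: UploadFile = File(...), question: str = Form(...)):
    # Decode the uploaded bytes into a PIL image for the pipelines above.
    pil_image = Image.open(BytesIO(await image.read())).convert("RGB")
    answer, audio_path = answer_question_from_image(pil_image, question)
    # Return the text answer plus the server-side path of the spoken MP3.
    return JSONResponse({"answer": answer, "audio_path": audio_path})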