maria355 committed
Commit b889520 · verified · 1 Parent(s): bd9009b

Update app.py

Files changed (1)
  1. app.py +313 -50
app.py CHANGED
@@ -1,13 +1,16 @@
import streamlit as st
import torch
from PIL import Image
- from transformers import Blip2Processor, Blip2ForConditionalGeneration
import io
import time

# Set page config
st.set_page_config(
-     page_title="🚀 BLIP-2 Caption Generator",
    page_icon="🚀",
    layout="wide",
    initial_sidebar_state="expanded"
@@ -38,67 +41,225 @@ st.markdown("""
    border-radius: 5px;
    margin: 1rem 0;
}
</style>
""", unsafe_allow_html=True)

@st.cache_resource
- def load_model():
-     """Load and cache the BLIP-2 model and processor"""
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"

-         # Use the smaller BLIP-2 model for better performance on Hugging Face Spaces
-         model_name = "Salesforce/blip2-opt-2.7b"
-
-         processor = Blip2Processor.from_pretrained(model_name)
-         model = Blip2ForConditionalGeneration.from_pretrained(
-             model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto" if device == "cuda" else None
        )

        if device == "cpu":
-             model = model.to(device)

-         return processor, model, device
    except Exception as e:
-         st.error(f"Error loading model: {str(e)}")
-         return None, None, None

- def generate_caption(image, processor, model, device, prompt=""):
-     """Generate caption for the uploaded image"""
    try:
-         # Prepare inputs
        if prompt:
            inputs = processor(image, text=prompt, return_tensors="pt").to(device)
        else:
            inputs = processor(image, return_tensors="pt").to(device)

-         # Generate caption
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
-                 max_length=50,
                num_beams=5,
                temperature=0.7,
                do_sample=True,
                early_stopping=True
            )

-         # Decode the generated caption
        caption = processor.decode(generated_ids[0], skip_special_tokens=True)
        return caption
-
    except Exception as e:
        st.error(f"Error generating caption: {str(e)}")
        return None

def main():
    # Header
    st.markdown("""
    <div class="main-header">
-         <h1>🚀 BLIP-2 Caption Generator</h1>
-         <p>Upload an image and get AI-generated captions instantly!</p>
    </div>
    """, unsafe_allow_html=True)
@@ -106,17 +267,34 @@ def main():
    with st.sidebar:
        st.header("🔧 Settings")
        st.markdown("### Model Information")
-         st.info("Using **BLIP-2** (Salesforce/blip2-opt-2.7b)")

        st.markdown("### About")
        st.markdown("""
-         This app uses the **BLIP-2** model to generate natural language descriptions of images.

        **Features:**
-         - 🖼️ Upload any image format
-         - 🤖 AI-powered captioning
-         - ⚡ Fast inference
-         - 🎯 Optional custom prompts
        """)

    # Main content
@@ -129,7 +307,7 @@ def main():
        uploaded_file = st.file_uploader(
            "Choose an image file",
            type=["jpg", "jpeg", "png", "bmp", "tiff"],
-             help="Upload an image to generate a caption"
        )

        if uploaded_file is not None:
@@ -146,46 +324,131 @@ def main():
            """)

    with col2:
-         st.markdown("### 🔮 Generated Caption")

        if uploaded_file is not None:
-             # Load model
-             with st.spinner("Loading BLIP-2 model..."):
-                 processor, model, device = load_model()

-             if processor is not None and model is not None:
-                 # Generate caption button
-                 if st.button("🎯 Generate Caption", type="primary"):
-                     with st.spinner("Generating caption..."):
                        start_time = time.time()

-                         # Generate caption
-                         caption = generate_caption(
-                             image, processor, model, device
                        )

                        end_time = time.time()

-                         if caption:
-                             # Display caption
                            st.markdown(f"""
                            <div class="caption-box">
-                                 <h4>📝 Caption:</h4>
-                                 <p style="font-size: 16px; font-weight: 500;">{caption}</p>
                            </div>
                            """, unsafe_allow_html=True)

                            # Performance info
-                             st.success(f"Caption generated in {end_time - start_time:.2f} seconds")

-                             # Copy to clipboard button
-                             st.code(caption, language=None)
            else:
-                 st.error("Failed to load the model. Please try refreshing the page.")
        else:
            st.markdown("""
            <div class="upload-section">
                <h3>👆 Upload an image to get started</h3>
                <p>Supported formats: JPG, PNG, BMP, TIFF</p>
            </div>
            """, unsafe_allow_html=True)
@@ -195,7 +458,7 @@ def main():
    st.markdown("""
    <div style="text-align: center; color: #666;">
        <p>Built with ❤️ using <strong>Streamlit</strong> and <strong>Hugging Face Transformers</strong></p>
-         <p>Powered by <strong>BLIP-2</strong> - Bootstrapping Language-Image Pre-training</p>
    </div>
    """, unsafe_allow_html=True)

@@ -1,13 +1,16 @@
import streamlit as st
import torch
from PIL import Image
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration, BlipProcessor, BlipForQuestionAnswering
import io
import time
+ import requests
+ from typing import List, Dict
+ import json

# Set page config
st.set_page_config(
+     page_title="🚀 Advanced BLIP-2 Caption Generator",
    page_icon="🚀",
    layout="wide",
    initial_sidebar_state="expanded"
@@ -38,67 +41,225 @@ st.markdown("""
    border-radius: 5px;
    margin: 1rem 0;
}
+ .analysis-box {
+     background-color: #f8f9fa;
+     border: 1px solid #dee2e6;
+     border-radius: 8px;
+     padding: 1rem;
+     margin: 0.5rem 0;
+ }
+ .location-box {
+     background-color: #e8f5e8;
+     border-left: 4px solid #28a745;
+     padding: 1rem;
+     border-radius: 5px;
+     margin: 1rem 0;
+ }
+ .objects-box {
+     background-color: #fff3cd;
+     border-left: 4px solid #ffc107;
+     padding: 1rem;
+     border-radius: 5px;
+     margin: 1rem 0;
+ }
</style>
""", unsafe_allow_html=True)

@st.cache_resource
+ def load_models():
+     """Load and cache the BLIP-2 model and BLIP VQA model"""
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"

+         # Load BLIP-2 for general captioning
+         blip2_model_name = "Salesforce/blip2-opt-2.7b"
+         blip2_processor = Blip2Processor.from_pretrained(blip2_model_name)
+         blip2_model = Blip2ForConditionalGeneration.from_pretrained(
+             blip2_model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto" if device == "cuda" else None
        )

+         # Load BLIP for Visual Question Answering
+         blip_model_name = "Salesforce/blip-vqa-base"
+         blip_processor = BlipProcessor.from_pretrained(blip_model_name)
+         blip_model = BlipForQuestionAnswering.from_pretrained(
+             blip_model_name,
+             torch_dtype=torch.float16 if device == "cuda" else torch.float32
+         )
+
        if device == "cpu":
+             blip2_model = blip2_model.to(device)
+             blip_model = blip_model.to(device)

+         return blip2_processor, blip2_model, blip_processor, blip_model, device
    except Exception as e:
+         st.error(f"Error loading models: {str(e)}")
+         return None, None, None, None, None

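The loader above pulls roughly 3.7 billion parameters into memory when Salesforce/blip2-opt-2.7b runs in float32 on CPU, which can be tight on a small Hugging Face Space. As a hedged sketch (not part of this commit, and the checkpoint choice is an assumption), a lighter BLIP captioning model could stand in for the BLIP-2 captioner on constrained hardware:

```python
# Sketch: a smaller captioning fallback for CPU-only Spaces (assumed checkpoint,
# not the one used in this app).
from transformers import BlipProcessor, BlipForConditionalGeneration

light_name = "Salesforce/blip-image-captioning-base"
light_processor = BlipProcessor.from_pretrained(light_name)
light_model = BlipForConditionalGeneration.from_pretrained(light_name)

def light_caption(image):
    """Return a short caption for a PIL image using the lighter BLIP model."""
    inputs = light_processor(image, return_tensors="pt")
    out = light_model.generate(**inputs, max_new_tokens=50)
    return light_processor.decode(out[0], skip_special_tokens=True)
```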
+ def generate_basic_caption(image, processor, model, device, prompt=""):
+     """Generate basic caption for the uploaded image"""
    try:
        if prompt:
            inputs = processor(image, text=prompt, return_tensors="pt").to(device)
        else:
            inputs = processor(image, return_tensors="pt").to(device)

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
+                 max_length=100,
                num_beams=5,
                temperature=0.7,
                do_sample=True,
                early_stopping=True
            )

        caption = processor.decode(generated_ids[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        st.error(f"Error generating caption: {str(e)}")
        return None

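Because `do_sample=True` is passed together with `num_beams=5`, generation above uses beam-search multinomial sampling, and `temperature=0.7` only has an effect because sampling is enabled. A minimal deterministic variant (an editorial sketch, not in this commit) drops sampling and caps only the newly generated tokens:

```python
# Sketch: deterministic beam search for the generate() call in generate_basic_caption.
# Assumes the same `inputs` and `model` prepared above; parameter values are illustrative.
generated_ids = model.generate(
    **inputs,
    max_new_tokens=60,    # limit on generated tokens, independent of the prompt length
    num_beams=5,
    do_sample=False,      # temperature is ignored when sampling is disabled
    early_stopping=True,
)
```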
+ def ask_visual_question(image, question, processor, model, device):
+     """Ask specific questions about the image using BLIP VQA"""
+     try:
+         inputs = processor(image, question, return_tensors="pt").to(device)
+
+         with torch.no_grad():
+             out = model.generate(**inputs, max_length=50, num_beams=3)
+
+         answer = processor.decode(out[0], skip_special_tokens=True)
+         return answer
+     except Exception as e:
+         return "Unable to determine"

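For reference, a standalone call to the VQA helper above might look like this (the file name and question are illustrative; `load_models()` is the cached loader defined earlier in this file):

```python
# Sketch: one-off visual question answering using the helpers defined in this file.
from PIL import Image

blip2_processor, blip2_model, blip_processor, blip_model, device = load_models()
img = Image.open("example.jpg").convert("RGB")  # hypothetical local image
print(ask_visual_question(img, "What city is this?", blip_processor, blip_model, device))
```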
+ def analyze_location_and_objects(image, blip_processor, blip_model, device):
+     """Analyze image for locations, landmarks, and objects"""
+     location_questions = [
+         "What country is this?",
+         "What city is this?",
+         "What landmark is this?",
+         "Where is this place?",
+         "What famous building is this?",
+         "What monument is this?",
+         "What geographical location is shown?",
+         "What tourist attraction is this?",
+         "What state or province is this?",
+         "What region is this?",
+         "What continent is this in?",
+         "What neighborhood is this?",
+         "What district is this?",
+         "What area is this?"
+     ]
+
+     object_questions = [
+         "What objects can you see in this image?",
+         "What are the main things in this picture?",
+         "What vehicles are in this image?",
+         "What buildings are visible?",
+         "What natural features are shown?",
+         "What people are doing in this image?",
+         "What animals are in this picture?",
+         "What food items can you see?",
+         "What clothing can you see?",
+         "What activities are happening?",
+         "What weather is shown?",
+         "What time of day is it?",
+         "What season does this appear to be?",
+         "What colors dominate this image?"
+     ]
+
+     architectural_questions = [
+         "What type of architecture is this?",
+         "What style of building is this?",
+         "What historical period does this represent?",
+         "What cultural elements are visible?",
+         "What materials is this building made of?",
+         "What architectural features are prominent?",
+         "What type of structure is this?",
+         "What design style is shown?"
+     ]
+
+     location_info = {}
+     object_info = {}
+     architectural_info = {}
+
+     # Analyze locations
+     for question in location_questions:
+         answer = ask_visual_question(image, question, blip_processor, blip_model, device)
+         if answer and answer.lower() not in ["no", "none", "unable to determine", "unknown", "unanswerable"]:
+             location_info[question] = answer
+
+     # Analyze objects
+     for question in object_questions:
+         answer = ask_visual_question(image, question, blip_processor, blip_model, device)
+         if answer and answer.lower() not in ["no", "none", "unable to determine", "unknown", "unanswerable"]:
+             object_info[question] = answer
+
+     # Analyze architecture
+     for question in architectural_questions:
+         answer = ask_visual_question(image, question, blip_processor, blip_model, device)
+         if answer and answer.lower() not in ["no", "none", "unable to determine", "unknown", "unanswerable"]:
+             architectural_info[question] = answer
+
+     return location_info, object_info, architectural_info

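The analysis above runs one `generate()` call per question (36 in total), which dominates latency on CPU. A hedged sketch of batching several questions through the BLIP VQA processor in a single forward pass (it assumes the processor accepts parallel lists of images and questions with `padding=True`, and that memory allows the batch):

```python
# Sketch: batched VQA (illustrative; relies on the module-level torch import above).
def ask_visual_questions_batched(image, questions, processor, model, device):
    """Answer a list of questions about one PIL image in a single generate() call."""
    inputs = processor(
        images=[image] * len(questions),  # repeat the image once per question
        text=list(questions),
        padding=True,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        out = model.generate(**inputs, max_length=50, num_beams=3)
    answers = processor.batch_decode(out, skip_special_tokens=True)
    return dict(zip(questions, answers))
```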
+ def generate_enhanced_caption(basic_caption, location_info, object_info, architectural_info):
+     """Generate enhanced caption combining all analysis"""
+     enhanced_parts = [basic_caption]
+
+     if location_info:
+         location_details = []
+         for question, answer in location_info.items():
+             if "country" in question.lower():
+                 location_details.append(f"Located in {answer}")
+             elif "city" in question.lower():
+                 location_details.append(f"in {answer}")
+             elif "landmark" in question.lower() or "monument" in question.lower():
+                 location_details.append(f"showing {answer}")
+             elif "building" in question.lower():
+                 location_details.append(f"featuring {answer}")
+             elif "state" in question.lower() or "province" in question.lower():
+                 location_details.append(f"in {answer}")
+             elif "region" in question.lower():
+                 location_details.append(f"in the {answer} region")
+
+         if location_details:
+             enhanced_parts.append(" ".join(location_details[:3]))  # Limit to avoid too long captions
+
+     if architectural_info:
+         arch_details = []
+         for question, answer in architectural_info.items():
+             if "architecture" in question.lower() or "style" in question.lower():
+                 arch_details.append(f"The architecture appears to be {answer}")
+             elif "period" in question.lower():
+                 arch_details.append(f"from the {answer} period")
+
+         if arch_details:
+             enhanced_parts.append(" ".join(arch_details[:2]))
+
+     if object_info:
+         obj_details = []
+         for question, answer in object_info.items():
+             if "time of day" in question.lower():
+                 obj_details.append(f"taken during {answer}")
+             elif "weather" in question.lower():
+                 obj_details.append(f"in {answer} weather")
+             elif "season" in question.lower():
+                 obj_details.append(f"during {answer}")
+
+         if obj_details:
+             enhanced_parts.append(" ".join(obj_details[:2]))
+
+     return ". ".join(enhanced_parts) + "."

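To make the string composition above concrete, here is a small worked example with hypothetical answers (none of these values come from the model or this repository):

```python
# Sketch: what generate_enhanced_caption produces for made-up inputs.
generate_enhanced_caption(
    "a large stone tower beside a river",
    {"What country is this?": "france", "What city is this?": "paris"},
    {"What time of day is it?": "evening"},
    {"What type of architecture is this?": "gothic"},
)
# -> 'a large stone tower beside a river. Located in france in paris. The architecture appears to be gothic. taken during evening.'
```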
def main():
    # Header
    st.markdown("""
    <div class="main-header">
+         <h1>🚀 Advanced BLIP-2 Caption Generator</h1>
+         <p>Upload an image and get comprehensive AI analysis including locations, landmarks, and objects!</p>
    </div>
    """, unsafe_allow_html=True)

@@ -106,17 +267,34 @@ def main():
    with st.sidebar:
        st.header("🔧 Settings")
        st.markdown("### Model Information")
+         st.info("Using **BLIP-2** + **BLIP-VQA** for comprehensive analysis")
+
+         # Analysis options
+         st.markdown("### Analysis Options")
+         include_location = st.checkbox("🌍 Location Analysis", value=True)
+         include_objects = st.checkbox("🎯 Object Detection", value=True)
+         include_architecture = st.checkbox("🏛️ Architecture Analysis", value=True)
+
+         # Custom questions
+         st.markdown("### Custom Questions")
+         custom_question = st.text_input(
+             "Ask about the image:",
+             placeholder="e.g., What time of day is this?"
+         )

        st.markdown("### About")
        st.markdown("""
+         This enhanced app uses multiple AI models:

        **Features:**
+         - 🖼️ Basic image captioning
+         - 🌍 Country & city recognition
+         - 🏛️ Landmark identification
+         - 🎯 Object detection
+         - 🏗️ Architecture analysis
+         - ❓ Custom Q&A
+         - 📍 State/Province detection
+         - 🌆 Neighborhood analysis
        """)

    # Main content
@@ -129,7 +307,7 @@ def main():
        uploaded_file = st.file_uploader(
            "Choose an image file",
            type=["jpg", "jpeg", "png", "bmp", "tiff"],
+             help="Upload an image for comprehensive analysis"
        )

        if uploaded_file is not None:

@@ -146,46 +324,131 @@ def main():
            """)

    with col2:
+         st.markdown("### 🔮 AI Analysis Results")

        if uploaded_file is not None:
+             # Load models
+             with st.spinner("Loading AI models..."):
+                 blip2_processor, blip2_model, blip_processor, blip_model, device = load_models()

+             if all([blip2_processor, blip2_model, blip_processor, blip_model]):
+                 # Analyze button
+                 if st.button("🚀 Analyze Image", type="primary"):
+                     with st.spinner("Performing comprehensive analysis..."):
                        start_time = time.time()

+                         # Generate basic caption
+                         basic_caption = generate_basic_caption(
+                             image, blip2_processor, blip2_model, device
+                         )
+
+                         # Analyze for locations and objects
+                         location_info, object_info, architectural_info = analyze_location_and_objects(
+                             image, blip_processor, blip_model, device
                        )

+                         # Custom question
+                         custom_answer = None
+                         if custom_question:
+                             custom_answer = ask_visual_question(
+                                 image, custom_question, blip_processor, blip_model, device
+                             )
+
                        end_time = time.time()

360
+ # Basic Caption
361
  st.markdown(f"""
362
  <div class="caption-box">
363
+ <h4>๐Ÿ“ Basic Caption:</h4>
364
+ <p style="font-size: 16px; font-weight: 500;">{basic_caption}</p>
365
+ </div>
366
+ """, unsafe_allow_html=True)
367
+
368
+ # Location Analysis
369
+ if include_location and location_info:
370
+ st.markdown("""
371
+ <div class="location-box">
372
+ <h4>๐ŸŒ Location Analysis:</h4>
373
+ </div>
374
+ """, unsafe_allow_html=True)
375
+
376
+ for question, answer in location_info.items():
377
+ st.write(f"**{question}** {answer}")
378
+
379
+ # Object Analysis
380
+ if include_objects and object_info:
381
+ st.markdown("""
382
+ <div class="objects-box">
383
+ <h4>๐ŸŽฏ Object Analysis:</h4>
384
+ </div>
385
+ """, unsafe_allow_html=True)
386
+
387
+ for question, answer in object_info.items():
388
+ st.write(f"**{question}** {answer}")
389
+
390
+ # Architecture Analysis
391
+ if include_architecture and architectural_info:
392
+ st.markdown("""
393
+ <div class="analysis-box">
394
+ <h4>๐Ÿ›๏ธ Architecture Analysis:</h4>
395
+ </div>
396
+ """, unsafe_allow_html=True)
397
+
398
+ for question, answer in architectural_info.items():
399
+ st.write(f"**{question}** {answer}")
400
+
401
+ # Custom Question Answer
402
+ if custom_answer:
403
+ st.markdown(f"""
404
+ <div class="analysis-box">
405
+ <h4>โ“ Custom Question:</h4>
406
+ <p><strong>Q:</strong> {custom_question}</p>
407
+ <p><strong>A:</strong> {custom_answer}</p>
408
+ </div>
409
+ """, unsafe_allow_html=True)
410
+
411
+ # Enhanced Caption
412
+ enhanced_caption = generate_enhanced_caption(
413
+ basic_caption, location_info, object_info, architectural_info
414
+ )
415
+
416
+ st.markdown(f"""
417
+ <div class="caption-box" style="border-left-color: #28a745;">
418
+ <h4>โœจ Enhanced Caption:</h4>
419
+ <p style="font-size: 16px; font-weight: 500;">{enhanced_caption}</p>
420
  </div>
421
  """, unsafe_allow_html=True)
422
 
423
  # Performance info
424
+ st.success(f"Analysis completed in {end_time - start_time:.2f} seconds")
425
+
426
+ # Copy caption to clipboard
427
+ st.code(enhanced_caption, language=None)
428
+
429
+ # Export options
430
+ analysis_data = {
431
+ "basic_caption": basic_caption,
432
+ "enhanced_caption": enhanced_caption,
433
+ "location_info": location_info if include_location else {},
434
+ "object_info": object_info if include_objects else {},
435
+ "architectural_info": architectural_info if include_architecture else {},
436
+ "custom_qa": {"question": custom_question, "answer": custom_answer} if custom_answer else None
437
+ }
438
 
439
+ st.download_button(
440
+ label="๐Ÿ“„ Download Analysis (JSON)",
441
+ data=json.dumps(analysis_data, indent=2),
442
+ file_name=f"image_analysis_{int(time.time())}.json",
443
+ mime="application/json"
444
+ )
445
  else:
446
+ st.error("Failed to load the models. Please try refreshing the page.")
447
  else:
448
  st.markdown("""
449
  <div class="upload-section">
450
  <h3>๐Ÿ‘† Upload an image to get started</h3>
451
+ <p>Get comprehensive AI analysis including locations, landmarks, and objects!</p>
452
  <p>Supported formats: JPG, PNG, BMP, TIFF</p>
453
  </div>
454
  """, unsafe_allow_html=True)
 
458
@@ -195,7 +458,7 @@ def main():
    st.markdown("""
    <div style="text-align: center; color: #666;">
        <p>Built with ❤️ using <strong>Streamlit</strong> and <strong>Hugging Face Transformers</strong></p>
+         <p>Powered by <strong>BLIP-2</strong> and <strong>BLIP-VQA</strong> for comprehensive image understanding</p>
    </div>
    """, unsafe_allow_html=True)