maria355 committed on
Commit b3d0543 · verified · 1 Parent(s): a90a526

Update app.py

Files changed (1):
  1. app.py +56 -313
app.py CHANGED
@@ -1,16 +1,13 @@
  import streamlit as st
  import torch
  from PIL import Image
- from transformers import Blip2Processor, Blip2ForConditionalGeneration, BlipProcessor, BlipForQuestionAnswering
  import io
  import time
- import requests
- from typing import List, Dict
- import json

  # Set page config
  st.set_page_config(
- page_title="🚀 Advanced BLIP-2 Caption Generator",
  page_icon="🚀",
  layout="wide",
  initial_sidebar_state="expanded"
@@ -41,225 +38,67 @@ st.markdown("""
  border-radius: 5px;
  margin: 1rem 0;
  }
- .analysis-box {
- background-color: #f8f9fa;
- border: 1px solid #dee2e6;
- border-radius: 8px;
- padding: 1rem;
- margin: 0.5rem 0;
- }
- .location-box {
- background-color: #e8f5e8;
- border-left: 4px solid #28a745;
- padding: 1rem;
- border-radius: 5px;
- margin: 1rem 0;
- }
- .objects-box {
- background-color: #fff3cd;
- border-left: 4px solid #ffc107;
- padding: 1rem;
- border-radius: 5px;
- margin: 1rem 0;
- }
  </style>
  """, unsafe_allow_html=True)

  @st.cache_resource
- def load_models():
- """Load and cache the BLIP-2 model and BLIP VQA model"""
  try:
  device = "cuda" if torch.cuda.is_available() else "cpu"

- # Load BLIP-2 for general captioning
- blip2_model_name = "Salesforce/blip2-opt-2.7b"
- blip2_processor = Blip2Processor.from_pretrained(blip2_model_name)
- blip2_model = Blip2ForConditionalGeneration.from_pretrained(
- blip2_model_name,
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
  device_map="auto" if device == "cuda" else None
  )

- # Load BLIP for Visual Question Answering
- blip_model_name = "Salesforce/blip-vqa-base"
- blip_processor = BlipProcessor.from_pretrained(blip_model_name)
- blip_model = BlipForQuestionAnswering.from_pretrained(
- blip_model_name,
- torch_dtype=torch.float16 if device == "cuda" else torch.float32
- )
-
  if device == "cpu":
- blip2_model = blip2_model.to(device)
- blip_model = blip_model.to(device)

- return blip2_processor, blip2_model, blip_processor, blip_model, device
  except Exception as e:
- st.error(f"Error loading models: {str(e)}")
- return None, None, None, None, None

- def generate_basic_caption(image, processor, model, device, prompt=""):
- """Generate basic caption for the uploaded image"""
  try:
  if prompt:
  inputs = processor(image, text=prompt, return_tensors="pt").to(device)
  else:
  inputs = processor(image, return_tensors="pt").to(device)

  with torch.no_grad():
  generated_ids = model.generate(
  **inputs,
- max_length=100,
  num_beams=5,
  temperature=0.7,
  do_sample=True,
  early_stopping=True
  )

  caption = processor.decode(generated_ids[0], skip_special_tokens=True)
  return caption
  except Exception as e:
  st.error(f"Error generating caption: {str(e)}")
  return None

- def ask_visual_question(image, question, processor, model, device):
- """Ask specific questions about the image using BLIP VQA"""
- try:
- inputs = processor(image, question, return_tensors="pt").to(device)
-
- with torch.no_grad():
- out = model.generate(**inputs, max_length=50, num_beams=3)
-
- answer = processor.decode(out[0], skip_special_tokens=True)
- return answer
- except Exception as e:
- return "Unable to determine"
-
- def analyze_location_and_objects(image, blip_processor, blip_model, device):
- """Analyze image for locations, landmarks, and objects"""
- location_questions = [
- "What country is this?",
- "What city is this?",
- "What landmark is this?",
- "Where is this place?",
- "What famous building is this?",
- "What monument is this?",
- "What geographical location is shown?",
- "What tourist attraction is this?",
- "What state or province is this?",
- "What region is this?",
- "What continent is this in?",
- "What neighborhood is this?",
- "What district is this?",
- "What area is this?"
- ]
-
- object_questions = [
- "What objects can you see in this image?",
- "What are the main things in this picture?",
- "What vehicles are in this image?",
- "What buildings are visible?",
- "What natural features are shown?",
- "What people are doing in this image?",
- "What animals are in this picture?",
- "What food items can you see?",
- "What clothing can you see?",
- "What activities are happening?",
- "What weather is shown?",
- "What time of day is it?",
- "What season does this appear to be?",
- "What colors dominate this image?"
- ]
-
- architectural_questions = [
- "What type of architecture is this?",
- "What style of building is this?",
- "What historical period does this represent?",
- "What cultural elements are visible?",
- "What materials is this building made of?",
- "What architectural features are prominent?",
- "What type of structure is this?",
- "What design style is shown?"
- ]
-
- location_info = {}
- object_info = {}
- architectural_info = {}
-
- # Analyze locations
- for question in location_questions:
- answer = ask_visual_question(image, question, blip_processor, blip_model, device)
- if answer and answer.lower() not in ["no", "none", "unable to determine", "unknown", "unanswerable"]:
- location_info[question] = answer
-
- # Analyze objects
- for question in object_questions:
- answer = ask_visual_question(image, question, blip_processor, blip_model, device)
- if answer and answer.lower() not in ["no", "none", "unable to determine", "unknown", "unanswerable"]:
- object_info[question] = answer
-
- # Analyze architecture
- for question in architectural_questions:
- answer = ask_visual_question(image, question, blip_processor, blip_model, device)
- if answer and answer.lower() not in ["no", "none", "unable to determine", "unknown", "unanswerable"]:
- architectural_info[question] = answer
-
- return location_info, object_info, architectural_info
-
- def generate_enhanced_caption(basic_caption, location_info, object_info, architectural_info):
- """Generate enhanced caption combining all analysis"""
- enhanced_parts = [basic_caption]
-
- if location_info:
- location_details = []
- for question, answer in location_info.items():
- if "country" in question.lower():
- location_details.append(f"Located in {answer}")
- elif "city" in question.lower():
- location_details.append(f"in {answer}")
- elif "landmark" in question.lower() or "monument" in question.lower():
- location_details.append(f"showing {answer}")
- elif "building" in question.lower():
- location_details.append(f"featuring {answer}")
- elif "state" in question.lower() or "province" in question.lower():
- location_details.append(f"in {answer}")
- elif "region" in question.lower():
- location_details.append(f"in the {answer} region")
-
- if location_details:
- enhanced_parts.append(" ".join(location_details[:3])) # Limit to avoid too long captions
-
- if architectural_info:
- arch_details = []
- for question, answer in architectural_info.items():
- if "architecture" in question.lower() or "style" in question.lower():
- arch_details.append(f"The architecture appears to be {answer}")
- elif "period" in question.lower():
- arch_details.append(f"from the {answer} period")
-
- if arch_details:
- enhanced_parts.append(" ".join(arch_details[:2]))
-
- if object_info:
- obj_details = []
- for question, answer in object_info.items():
- if "time of day" in question.lower():
- obj_details.append(f"taken during {answer}")
- elif "weather" in question.lower():
- obj_details.append(f"in {answer} weather")
- elif "season" in question.lower():
- obj_details.append(f"during {answer}")
-
- if obj_details:
- enhanced_parts.append(" ".join(obj_details[:2]))
-
- return ". ".join(enhanced_parts) + "."
-
  def main():
  # Header
  st.markdown("""
  <div class="main-header">
- <h1>🚀 Advanced BLIP-2 Caption Generator</h1>
- <p>Upload an image and get comprehensive AI analysis including locations, landmarks, and objects!</p>
  </div>
  """, unsafe_allow_html=True)

@@ -267,34 +106,23 @@ def main():
  with st.sidebar:
  st.header("🔧 Settings")
  st.markdown("### Model Information")
- st.info("Using **BLIP-2** + **BLIP-VQA** for comprehensive analysis")

- # Analysis options
- st.markdown("### Analysis Options")
- include_location = st.checkbox("🌍 Location Analysis", value=True)
- include_objects = st.checkbox("🎯 Object Detection", value=True)
- include_architecture = st.checkbox("🏛️ Architecture Analysis", value=True)
-
- # Custom questions
- st.markdown("### Custom Questions")
- custom_question = st.text_input(
- "Ask about the image:",
- placeholder="e.g., What time of day is this?"
  )

  st.markdown("### About")
  st.markdown("""
- This enhanced app uses multiple AI models:

  **Features:**
- - 🖼️ Basic image captioning
- - 🌍 Country & city recognition
- - 🏛️ Landmark identification
- - 🎯 Object detection
- - 🏗️ Architecture analysis
- - ❓ Custom Q&A
- - 📍 State/Province detection
- - 🌆 Neighborhood analysis
  """)

  # Main content
@@ -307,13 +135,13 @@ def main():
  uploaded_file = st.file_uploader(
  "Choose an image file",
  type=["jpg", "jpeg", "png", "bmp", "tiff"],
- help="Upload an image for comprehensive analysis"
  )

  if uploaded_file is not None:
  # Display uploaded image
  image = Image.open(uploaded_file)
- st.image(image, caption="Uploaded Image", use_container_width=True)

  # Image info
  st.markdown(f"""
@@ -324,131 +152,46 @@ def main():
  """)

  with col2:
- st.markdown("### 🔮 AI Analysis Results")

  if uploaded_file is not None:
- # Load models
- with st.spinner("Loading AI models..."):
- blip2_processor, blip2_model, blip_processor, blip_model, device = load_models()

- if all([blip2_processor, blip2_model, blip_processor, blip_model]):
- # Analyze button
- if st.button("🚀 Analyze Image", type="primary"):
- with st.spinner("Performing comprehensive analysis..."):
  start_time = time.time()

- # Generate basic caption
- basic_caption = generate_basic_caption(
- image, blip2_processor, blip2_model, device
- )
-
- # Analyze for locations and objects
- location_info, object_info, architectural_info = analyze_location_and_objects(
- image, blip_processor, blip_model, device
  )

- # Custom question
- custom_answer = None
- if custom_question:
- custom_answer = ask_visual_question(
- image, custom_question, blip_processor, blip_model, device
- )
-
  end_time = time.time()

- if basic_caption:
- # Basic Caption
  st.markdown(f"""
  <div class="caption-box">
- <h4>📝 Basic Caption:</h4>
- <p style="font-size: 16px; font-weight: 500;">{basic_caption}</p>
- </div>
- """, unsafe_allow_html=True)
-
- # Location Analysis
- if include_location and location_info:
- st.markdown("""
- <div class="location-box">
- <h4>🌍 Location Analysis:</h4>
- </div>
- """, unsafe_allow_html=True)
-
- for question, answer in location_info.items():
- st.write(f"**{question}** {answer}")
-
- # Object Analysis
- if include_objects and object_info:
- st.markdown("""
- <div class="objects-box">
- <h4>🎯 Object Analysis:</h4>
- </div>
- """, unsafe_allow_html=True)
-
- for question, answer in object_info.items():
- st.write(f"**{question}** {answer}")
-
- # Architecture Analysis
- if include_architecture and architectural_info:
- st.markdown("""
- <div class="analysis-box">
- <h4>🏛️ Architecture Analysis:</h4>
- </div>
- """, unsafe_allow_html=True)
-
- for question, answer in architectural_info.items():
- st.write(f"**{question}** {answer}")
-
- # Custom Question Answer
- if custom_answer:
- st.markdown(f"""
- <div class="analysis-box">
- <h4>❓ Custom Question:</h4>
- <p><strong>Q:</strong> {custom_question}</p>
- <p><strong>A:</strong> {custom_answer}</p>
- </div>
- """, unsafe_allow_html=True)
-
- # Enhanced Caption
- enhanced_caption = generate_enhanced_caption(
- basic_caption, location_info, object_info, architectural_info
- )
-
- st.markdown(f"""
- <div class="caption-box" style="border-left-color: #28a745;">
- <h4>✨ Enhanced Caption:</h4>
- <p style="font-size: 16px; font-weight: 500;">{enhanced_caption}</p>
  </div>
  """, unsafe_allow_html=True)

  # Performance info
- st.success(f"Analysis completed in {end_time - start_time:.2f} seconds")
-
- # Copy caption to clipboard
- st.code(enhanced_caption, language=None)
-
- # Export options
- analysis_data = {
- "basic_caption": basic_caption,
- "enhanced_caption": enhanced_caption,
- "location_info": location_info if include_location else {},
- "object_info": object_info if include_objects else {},
- "architectural_info": architectural_info if include_architecture else {},
- "custom_qa": {"question": custom_question, "answer": custom_answer} if custom_answer else None
- }

- st.download_button(
- label="📄 Download Analysis (JSON)",
- data=json.dumps(analysis_data, indent=2),
- file_name=f"image_analysis_{int(time.time())}.json",
- mime="application/json"
- )
  else:
- st.error("Failed to load the models. Please try refreshing the page.")
  else:
  st.markdown("""
  <div class="upload-section">
  <h3>👆 Upload an image to get started</h3>
- <p>Get comprehensive AI analysis including locations, landmarks, and objects!</p>
  <p>Supported formats: JPG, PNG, BMP, TIFF</p>
  </div>
  """, unsafe_allow_html=True)
@@ -457,8 +200,8 @@ def main():
  st.markdown("---")
  st.markdown("""
  <div style="text-align: center; color: #666;">
- <p>Built with ❤️ using <strong>Streamlit</strong> and <strong>Hugging Face Transformers</strong></p>
- <p>Powered by <strong>BLIP-2</strong> and <strong>BLIP-VQA</strong> for comprehensive image understanding</p>
  </div>
  """, unsafe_allow_html=True)

  import streamlit as st
  import torch
  from PIL import Image
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
  import io
  import time

  # Set page config
  st.set_page_config(
+ page_title="🚀 BLIP-2 Caption Generator",
  page_icon="🚀",
  layout="wide",
  initial_sidebar_state="expanded"

  border-radius: 5px;
  margin: 1rem 0;
  }
  </style>
  """, unsafe_allow_html=True)

  @st.cache_resource
+ def load_model():
+ """Load and cache the BLIP-2 model and processor"""
  try:
  device = "cuda" if torch.cuda.is_available() else "cpu"

+ # Use the smaller BLIP-2 model for better performance on Hugging Face Spaces
+ model_name = "Salesforce/blip2-opt-2.7b"
+
+ processor = Blip2Processor.from_pretrained(model_name)
+ model = Blip2ForConditionalGeneration.from_pretrained(
+ model_name,
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
  device_map="auto" if device == "cuda" else None
  )

  if device == "cpu":
+ model = model.to(device)

+ return processor, model, device
  except Exception as e:
+ st.error(f"Error loading model: {str(e)}")
+ return None, None, None

+ def generate_caption(image, processor, model, device, prompt=""):
+ """Generate caption for the uploaded image"""
  try:
+ # Prepare inputs
  if prompt:
  inputs = processor(image, text=prompt, return_tensors="pt").to(device)
  else:
  inputs = processor(image, return_tensors="pt").to(device)

+ # Generate caption
  with torch.no_grad():
  generated_ids = model.generate(
  **inputs,
+ max_length=50,
  num_beams=5,
  temperature=0.7,
  do_sample=True,
  early_stopping=True
  )

+ # Decode the generated caption
  caption = processor.decode(generated_ids[0], skip_special_tokens=True)
  return caption
+
  except Exception as e:
  st.error(f"Error generating caption: {str(e)}")
  return None

  def main():
  # Header
  st.markdown("""
  <div class="main-header">
+ <h1>🚀 BLIP-2 Caption Generator</h1>
+ <p>Upload an image and get AI-generated captions instantly!</p>
  </div>
  """, unsafe_allow_html=True)

  with st.sidebar:
  st.header("🔧 Settings")
  st.markdown("### Model Information")
+ st.info("Using **BLIP-2** (Salesforce/blip2-opt-2.7b)")

+ # Custom prompt option
+ custom_prompt = st.text_input(
+ "Custom Prompt (Optional):",
+ placeholder="e.g., 'Question: What is in this image? Answer:'"
  )

  st.markdown("### About")
  st.markdown("""
+ This app uses the **BLIP-2** model to generate natural language descriptions of images.

  **Features:**
+ - 🖼️ Upload any image format
+ - 🤖 AI-powered captioning
+ - ⚡ Fast inference
+ - 🎯 Optional custom prompts
  """)

  # Main content

  uploaded_file = st.file_uploader(
  "Choose an image file",
  type=["jpg", "jpeg", "png", "bmp", "tiff"],
+ help="Upload an image to generate a caption"
  )

  if uploaded_file is not None:
  # Display uploaded image
  image = Image.open(uploaded_file)
+ st.image(image, caption="Uploaded Image", use_column_width=True)

  # Image info
  st.markdown(f"""

  """)

  with col2:
+ st.markdown("### 🔮 Generated Caption")

  if uploaded_file is not None:
+ # Load model
+ with st.spinner("Loading BLIP-2 model..."):
+ processor, model, device = load_model()

+ if processor is not None and model is not None:
+ # Generate caption button
+ if st.button("🎯 Generate Caption", type="primary"):
+ with st.spinner("Generating caption..."):
  start_time = time.time()

+ # Generate caption
+ caption = generate_caption(
+ image, processor, model, device, custom_prompt
  )

  end_time = time.time()

+ if caption:
+ # Display caption
  st.markdown(f"""
  <div class="caption-box">
+ <h4>📝 Caption:</h4>
+ <p style="font-size: 16px; font-weight: 500;">{caption}</p>
  </div>
  """, unsafe_allow_html=True)

  # Performance info
+ st.success(f"Caption generated in {end_time - start_time:.2f} seconds")

+ # Copy to clipboard button
+ st.code(caption, language=None)
  else:
+ st.error("Failed to load the model. Please try refreshing the page.")
  else:
  st.markdown("""
  <div class="upload-section">
  <h3>👆 Upload an image to get started</h3>
  <p>Supported formats: JPG, PNG, BMP, TIFF</p>
  </div>
  """, unsafe_allow_html=True)

  st.markdown("---")
  st.markdown("""
  <div style="text-align: center; color: #666;">
+ <p>Built with <strong>Streamlit</strong> and <strong>Hugging Face Transformers</strong></p>
+ <p>Powered by <strong>BLIP-2</strong> - Bootstrapping Language-Image Pre-training</p>
  </div>
  """, unsafe_allow_html=True)
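
For reference, the captioning path that the updated app.py wraps in Streamlit can be exercised on its own. A minimal sketch, assuming a local test image named example.jpg and CPU-only inference; the file name and the simplified generate() arguments are illustrative, not part of the commit:

import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Same checkpoint the app loads in load_model()
model_name = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.float32)

# example.jpg is a placeholder; the app gets its image from st.file_uploader
image = Image.open("example.jpg").convert("RGB")
inputs = processor(image, return_tensors="pt")

with torch.no_grad():
    # The app additionally passes temperature=0.7, do_sample=True, early_stopping=True
    generated_ids = model.generate(**inputs, max_length=50, num_beams=5)

caption = processor.decode(generated_ids[0], skip_special_tokens=True)
print(caption)

Inside the app, the same two steps are cached with @st.cache_resource and triggered from the "Generate Caption" button; running streamlit run app.py starts the UI.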