imomayiz committed
Commit 58c54a2 · verified · 1 parent: 00a9e34

fix file not found error

Files changed (1)
  1. app.py +264 -45
app.py CHANGED
@@ -1,55 +1,274 @@
import gradio as gr
- import spaces
+ import torch
from PIL import Image
+ import logging
+ from typing import Optional, Union
+ import os

+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)

+ class AtlasOCR:
+     def __init__(self, model_name: str = "atlasia/AtlasOCR-v0", max_tokens: int = 2000):
+         """Initialize the AtlasOCR model with proper error handling."""
+         try:
+             from unsloth import FastVisionModel
+
+             logger.info(f"Loading model: {model_name}")
+             self.model, self.processor = FastVisionModel.from_pretrained(
+                 model_name,
+                 device_map="auto",
+                 load_in_4bit=True,
+                 use_gradient_checkpointing="unsloth"
+             )
+             self.max_tokens = max_tokens
+             self.prompt = ""
+             logger.info("Model loaded successfully")
+
+         except ImportError:
+             logger.error("unsloth not found. Please install it: pip install unsloth")
+             raise
+         except Exception as e:
+             logger.error(f"Error loading model: {e}")
+             raise
+
+     def prepare_inputs(self, image: Image.Image) -> dict:
+         """Prepare inputs for the model with proper error handling."""
+         try:
+             messages = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image",
+                         },
+                         {"type": "text", "text": self.prompt},
+                     ],
+                 }
+             ]
+
+             text = self.processor.apply_chat_template(
+                 messages, tokenize=False, add_generation_prompt=True
+             )
+
+             inputs = self.processor(
+                 image,
+                 text,
+                 add_special_tokens=False,
+                 return_tensors="pt",
+             )
+             return inputs
+
+         except Exception as e:
+             logger.error(f"Error preparing inputs: {e}")
+             raise
+
+     def predict(self, image: Image.Image) -> str:
+         """Predict text from image with comprehensive error handling."""
+         try:
+             if image is None:
+                 return "Please upload an image."
+
+             # Convert numpy array to PIL Image if needed
+             if hasattr(image, 'shape'):  # numpy array
+                 image = Image.fromarray(image)
+
+             inputs = self.prepare_inputs(image)
+
+             # Move inputs to GPU if available
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+             inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
+
+             # Ensure attention_mask is float32
+             if 'attention_mask' in inputs:
+                 inputs['attention_mask'] = inputs['attention_mask'].to(torch.float32)
+
+             logger.info(f"Generating text with max_tokens={self.max_tokens}")
+             with torch.no_grad():
+                 generated_ids = self.model.generate(
+                     **inputs,
+                     max_new_tokens=self.max_tokens,
+                     use_cache=True,
+                     do_sample=False,
+                     temperature=0.1
+                 )
+
+             generated_ids_trimmed = [
+                 out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
+             ]
+
+             output_text = self.processor.batch_decode(
+                 generated_ids_trimmed,
+                 skip_special_tokens=True,
+                 clean_up_tokenization_spaces=False
+             )
+
+             result = output_text[0].strip()
+             logger.info(f"Generated text: {result[:100]}...")
+             return result
+
+         except Exception as e:
+             logger.error(f"Error during prediction: {e}")
+             return f"Error processing image: {str(e)}"
+
+     def __call__(self, image: Union[Image.Image, str]) -> str:
+         """Callable interface for the model."""
+         if isinstance(image, str):
+             return "Please upload an image file."
+         return self.predict(image)
+
+
+ # Global model instance
+ atlas_ocr = None
+
+ def load_model():
+     """Load the model globally to avoid reloading."""
+     global atlas_ocr
+     if atlas_ocr is None:
+         try:
+             atlas_ocr = AtlasOCR()
+         except Exception as e:
+             logger.error(f"Failed to load model: {e}")
+             return False
+     return True

- @spaces.GPU
def perform_ocr(image):
-     from atlasocr_model import AtlasOCR
-     atlas_ocr=AtlasOCR()
-     output_text = atlas_ocr(image)
-     return output_text
-
- # Create Gradio interface
- with gr.Blocks(title="AtlasOCR") as demo:
-     gr.Markdown("# AtlasOCR")
-     gr.Markdown("Upload an image to extract Darija text in real-time. This model is specialized for Darija document OCR.")
+     """Main OCR function with proper error handling."""
+     try:
+         if not load_model():
+             return "Error: Failed to load model. Please check the logs."
+
+         if image is None:
+             return "Please upload an image to extract text."
+
+         result = atlas_ocr(image)
+         return result
+
+     except Exception as e:
+         logger.error(f"Error in perform_ocr: {e}")
+         return f"An error occurred: {str(e)}"
+
+ def create_interface():
+     """Create the Gradio interface with proper configuration."""

-     with gr.Row():
-         with gr.Column(scale=1):
-             # Input image
-             image_input = gr.Image(type="numpy", label="Upload Image")
-
-             # Example gallery
-             gr.Examples(
-                 examples=[
-                     ["i3.jpg"],
-                     ["i6.jpg"]
-                 ],
-                 inputs=image_input,
-                 label="Example Images",
-                 examples_per_page=4
-             )
+     # Example images from assets
+     example_images = []
+     assets_dir = "assets"
+     if os.path.exists(assets_dir):
+         for file in os.listdir(assets_dir):
+             if file.lower().endswith(('.png', '.jpg', '.jpeg')):
+                 example_images.append([os.path.join(assets_dir, file)])
+
+     # If no example images found, use empty list
+     if not example_images:
+         example_images = []
+
+     with gr.Blocks(
+         title="AtlasOCR - Darija Document OCR",
+         theme=gr.themes.Soft(),
+         css="""
+         .gradio-container {
+             max-width: 1200px !important;
+         }
+         """
+     ) as demo:
+
+         gr.Markdown("""
+         # AtlasOCR - Darija Document OCR
+         Upload an image to extract Darija text in real-time. This model is specialized for Darija document OCR.
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 # Input image
+                 image_input = gr.Image(
+                     type="pil",
+                     label="Upload Image",
+                     height=400
+                 )
+
+                 # Example gallery
+                 if example_images:
+                     gr.Examples(
+                         examples=example_images,
+                         inputs=image_input,
+                         label="Example Images",
+                         examples_per_page=4
+                     )
+
+                 # Submit button
+                 submit_btn = gr.Button(
+                     "Extract Text",
+                     variant="primary",
+                     size="lg"
+                 )
+
+                 # Clear button
+                 clear_btn = gr.Button("Clear", variant="secondary")

-             # Submit button
-             submit_btn = gr.Button("Extract Text")
-
-         with gr.Column(scale=1):
-             # Output text
-             output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)
-
-     # Model details
-     with gr.Accordion("Model Information", open=False):
-         gr.Markdown("""
-         **Model:** AtlasOCR-v0
-         **Description:** Darija OCR model
-         **Size:** 3B parameters
-         **Context window:** Supports up to 2000 output tokens
-         """)
+             with gr.Column(scale=1):
+                 # Output text
+                 output = gr.Textbox(
+                     label="Extracted Text",
+                     lines=20,
+                     show_copy_button=True,
+                     placeholder="Extracted text will appear here..."
+                 )
+
+                 # Status indicator
+                 status = gr.Textbox(
+                     label="Status",
+                     value="Ready to process images",
+                     interactive=False
+                 )
+
+         # Model details
+         with gr.Accordion("Model Information", open=False):
+             gr.Markdown("""
+             **Model:** AtlasOCR-v0
+             **Description:** Specialized Darija OCR model for Arabic dialect text extraction
+             **Size:** 3B parameters
+             **Context window:** Supports up to 2000 output tokens
+             **Optimization:** 4-bit quantization for efficient inference
+             """)
+
+         # Set up processing flow
+         def process_with_status(image):
+             if image is None:
+                 return "Please upload an image.", "No image provided"
+
+             try:
+                 result = perform_ocr(image)
+                 return result, "Processing completed successfully"
+             except Exception as e:
+                 return f"Error: {str(e)}", f"Error occurred: {str(e)}"
+
+         submit_btn.click(
+             fn=process_with_status,
+             inputs=image_input,
+             outputs=[output, status]
+         )
+
+         image_input.change(
+             fn=process_with_status,
+             inputs=image_input,
+             outputs=[output, status]
+         )
+
+         clear_btn.click(
+             fn=lambda: (None, "", "Ready to process images"),
+             outputs=[image_input, output, status]
+         )

-     # Set up processing flow
-     submit_btn.click(fn=perform_ocr, inputs=image_input, outputs=output)
-     image_input.change(fn=perform_ocr, inputs=image_input, outputs=output)
+     return demo

- demo.launch()
+ # Create and launch the interface
+ if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         debug=True
+     )
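
For quick local verification, a minimal usage sketch (not part of the commit) that exercises the updated module outside the Gradio UI; it assumes torch, unsloth, and the atlasia/AtlasOCR-v0 weights are available, and "sample.jpg" is a placeholder path:

from PIL import Image

from app import create_interface, perform_ocr  # safe to import: demo.launch() is guarded by __main__

if __name__ == "__main__":
    # One-off OCR call: the model is loaded lazily inside perform_ocr on first use.
    image = Image.open("sample.jpg")  # placeholder path to any Darija document image
    print(perform_ocr(image))

    # Or build and serve the full Gradio UI, as app.py does when run directly:
    # create_interface().launch(server_name="0.0.0.0", server_port=7860)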