	Add DeepSeek-OCR Gradio application files
Files changed:
- README.md (+34 -5)
- app.py (+141 -0)
- requirements.txt (+15 -0)
README.md CHANGED

```diff
@@ -1,14 +1,43 @@
 ---
 title: DeepSeek OCR Demo
-emoji: 
-colorFrom: 
+emoji: 🖼
+colorFrom: purple
 colorTo: red
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.44.0
 app_file: app.py
 pinned: false
 license: mit
-short_description: 
+short_description: An interactive demo for the DeepSeek-OCR model.
 ---
 
-
+# DeepSeek-OCR Document Recognition
+
+This Space uses the DeepSeek-OCR model for document text recognition and extraction.
+
+## Features
+
+- Multiple model size options (Tiny to Large)
+- Free OCR and Markdown conversion
+- Support for various document types
+- Powered by ZeroGPU for efficient inference
+
+## Usage
+
+1. Upload an image containing text
+2. Select model size (Gundam recommended for documents)
+3. Choose task type
+4. Click "Process Image"
+
+## Model Sizes
+
+- **Tiny**: 512x512, fastest
+- **Small**: 640x640, good balance
+- **Base**: 1024x1024, high quality
+- **Large**: 1280x1280, best quality
+- **Gundam**: Optimized for documents with crop mode
+
+## Credits
+
+Model: [deepseek-ai/DeepSeek-OCR](https://huggingface.co/deepseek-ai/DeepSeek-OCR)
+
```
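The README's Usage steps describe the web UI; the same Space can also be driven programmatically. Below is a minimal sketch using `gradio_client`. The Space id `your-username/deepseek-ocr-demo`, the input image path, and the `api_name` are assumptions for illustration; the Space's "Use via API" panel shows the actual values.

```python
from gradio_client import Client, handle_file

# Hypothetical Space id; replace with the real "owner/space-name".
client = Client("your-username/deepseek-ocr-demo")

markdown = client.predict(
    handle_file("page.png"),     # placeholder path to the image to OCR
    "Gundam (Recommended)",      # model size, as listed under Model Sizes
    "Convert to Markdown",       # task type
    api_name="/process_image",   # assumed: Gradio derives this from the handler name
)
print(markdown)
```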
app.py ADDED (+141 lines)

```python
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
import os
import tempfile

# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval().to(torch.bfloat16)


@spaces.GPU
def process_image(image, model_size, task_type):
    """
    Process image with DeepSeek-OCR

    Args:
        image: PIL Image or file path
        model_size: Model size configuration
        task_type: OCR task type
    """
    # Create temporary directory for output
    with tempfile.TemporaryDirectory() as output_path:
        # Set prompt based on task type
        if task_type == "Free OCR":
            prompt = "<image>\nFree OCR. "
        elif task_type == "Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        elif task_type == "Extract Text":
            prompt = "<image>\nExtract all text from the image. "
        else:
            prompt = "<image>\nFree OCR. "

        # Save uploaded image temporarily
        temp_image_path = os.path.join(output_path, "temp_image.jpg")
        image.save(temp_image_path)

        # Configure model size parameters
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {
                "base_size": 1024,
                "image_size": 640,
                "crop_mode": True,
            },
        }

        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

        # Run inference
        result = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
        )

        return result


# Create Gradio interface
with gr.Blocks(title="DeepSeek-OCR") as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR Document Recognition

        Upload an image to extract text using the DeepSeek-OCR model.
        Supports various document types and handwriting recognition.

        **Model Sizes:**
        - **Tiny**: Fastest, lower accuracy (512x512)
        - **Small**: Fast, good accuracy (640x640)
        - **Base**: Balanced performance (1024x1024)
        - **Large**: Best accuracy, slower (1280x1280)
        - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                type="pil", label="Upload Image", sources=["upload", "clipboard"]
            )

            model_size = gr.Dropdown(
                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                value="Gundam (Recommended)",
                label="Model Size",
            )

            task_type = gr.Dropdown(
                choices=["Free OCR", "Convert to Markdown", "Extract Text"],
                value="Convert to Markdown",
                label="Task Type",
            )

            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column():
            output_text = gr.Textbox(
                label="OCR Result", lines=20, show_copy_button=True
            )

    # Examples
    gr.Examples(
        examples=[
            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown"],
            ["examples/receipt.jpg", "Base", "Free OCR"],
        ],
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
        fn=process_image,
        cache_examples=False,
    )

    submit_btn.click(
        fn=process_image,
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
    )

# Launch the app
if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch()
```
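For running the model outside the Space, here is a minimal sketch that mirrors the load and `infer()` calls in app.py above, using the "Gundam" preset. The image path and output directory are placeholders, and a CUDA GPU with the pinned flash-attn build is assumed.

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Same loading code as app.py above.
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
).eval().to(torch.bfloat16)

# "Gundam" preset from app.py; "invoice.png" and "./ocr_out" are placeholders.
text = model.infer(
    tokenizer,
    prompt="<image>\n<|grounding|>Convert the document to markdown. ",
    image_file="invoice.png",
    output_path="./ocr_out",
    base_size=1024,
    image_size=640,
    crop_mode=True,
    save_results=True,
    test_compress=True,
)
print(text)
```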
requirements.txt ADDED (+15 lines)

```text
torch==2.6.0
transformers==4.46.3
tokenizers==0.20.3
einops
addict
easydict
gradio>=4.0.0
spaces>=0.20.0
Pillow>=10.0.0
safetensors>=0.4.0
accelerate>=0.24.0
sentencepiece>=0.1.99
protobuf>=3.20.0
torchvision
flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
```
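The flash-attn line pins a prebuilt wheel whose filename encodes its build constraints: CPython 3.10, CUDA 12, and torch 2.6. A small check like the sketch below (a hypothetical helper, not part of the commit) can catch a mismatched environment before the wheel fails at import time.

```python
# Hypothetical pre-install check derived from the wheel filename above:
# flash_attn-2.7.3+cu12torch2.6 ... cp310-cp310-linux_x86_64.whl
import sys

import torch

assert sys.version_info[:2] == (3, 10), "flash-attn wheel targets CPython 3.10"
assert torch.__version__.startswith("2.6"), "flash-attn wheel targets torch 2.6"
assert torch.version.cuda and torch.version.cuda.startswith("12"), (
    "flash-attn wheel targets CUDA 12"
)
print("environment matches the pinned flash-attn wheel")
```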