# Delete .ipynb_checkpoints/README-checkpoint.md (Duplicate, Junk)

#10, opened by qpqpqpqpqpqp

**`.ipynb_checkpoints/README-checkpoint.md`** (DELETED, `@@ -1,122 +0,0 @@`)

The deleted file was a stale Jupyter checkpoint duplicating the model card; its 122 lines follow.
---
pipeline_tag: image-text-to-text
language:
- multilingual
tags:
- deepseek
- vision-language
- ocr
- custom_code
license: mit
---
<div align="center">
  <img src="https://github.com/deepseek-ai/DeepSeek-V2/blob/main/figures/logo.svg?raw=true" width="60%" alt="DeepSeek AI" />
</div>
<hr>
<div align="center">
  <a href="https://www.deepseek.com/" target="_blank">
    <img alt="Homepage" src="https://github.com/deepseek-ai/DeepSeek-V2/blob/main/figures/badge.svg?raw=true" />
  </a>
  <a href="https://huggingface.co/deepseek-ai/DeepSeek-OCR" target="_blank">
    <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white" />
  </a>
</div>

<div align="center">
  <a href="https://discord.gg/Tc7c45Zzu5" target="_blank">
    <img alt="Discord" src="https://img.shields.io/badge/Discord-DeepSeek%20AI-7289da?logo=discord&logoColor=white&color=7289da" />
  </a>
  <a href="https://twitter.com/deepseek_ai" target="_blank">
    <img alt="Twitter Follow" src="https://img.shields.io/badge/Twitter-deepseek_ai-white?logo=x&logoColor=white" />
  </a>
</div>

<p align="center">
  <a href="https://github.com/deepseek-ai/DeepSeek-OCR"><b>🌟 Github</b></a> |
  <a href="https://huggingface.co/deepseek-ai/DeepSeek-OCR"><b>📥 Model Download</b></a> |
  <a href="https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek_OCR_paper.pdf"><b>📄 Paper Link</b></a> |
  <a href=""><b>📄 Arxiv Paper Link</b></a> |
</p>
<h2>
<p align="center">
  <a href="">DeepSeek-OCR: Contexts Optical Compression</a>
</p>
</h2>
<p align="center">
<img src="assets/fig1.png" style="width: 1000px" align=center>
</p>
<p align="center">
<a href="">Explore the boundaries of visual-text compression.</a>
</p>
## Usage

Inference uses Hugging Face Transformers on NVIDIA GPUs. The following requirements were tested on Python 3.12.9 with CUDA 11.8:

```
torch==2.6.0
transformers==4.46.3
tokenizers==0.20.3
einops
addict
easydict
```

flash-attn is installed separately:

```
pip install flash-attn==2.7.3 --no-build-isolation
```
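
Before loading the model, the environment can be sanity-checked against the tested versions above. This is a minimal sketch, not part of the model card:

```python
# A quick environment check (a sketch, not from the model card),
# comparing the installed stack against the tested versions above.
import torch
import transformers

print("torch:", torch.__version__)                    # tested: 2.6.0
print("transformers:", transformers.__version__)      # tested: 4.46.3
print("CUDA available:", torch.cuda.is_available())   # an NVIDIA GPU is expected

try:
    import flash_attn  # needed for _attn_implementation='flash_attention_2'
    print("flash-attn:", flash_attn.__version__)      # tested: 2.7.3
except ImportError:
    print("flash-attn missing; pip install flash-attn==2.7.3 --no-build-isolation")
```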

Basic inference:

```python
from transformers import AutoModel, AutoTokenizer
import torch
import os

os.environ["CUDA_VISIBLE_DEVICES"] = '0'
model_name = 'deepseek-ai/DeepSeek-OCR'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, _attn_implementation='flash_attention_2',
                                  trust_remote_code=True, use_safetensors=True)
model = model.eval().cuda().to(torch.bfloat16)

# prompt = "<image>\nFree OCR. "  # plain OCR, without layout grounding
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
image_file = 'your_image.jpg'
output_path = 'your/output/dir'

# Signature:
# infer(self, tokenizer, prompt='', image_file='', output_path=' ',
#       base_size=1024, image_size=640, crop_mode=True,
#       test_compress=False, save_results=False)

# Resolution presets:
# Tiny:   base_size = 512,  image_size = 512,  crop_mode = False
# Small:  base_size = 640,  image_size = 640,  crop_mode = False
# Base:   base_size = 1024, image_size = 1024, crop_mode = False
# Large:  base_size = 1280, image_size = 1280, crop_mode = False
# Gundam: base_size = 1024, image_size = 640,  crop_mode = True

res = model.infer(tokenizer, prompt=prompt, image_file=image_file,
                  output_path=output_path, base_size=1024, image_size=640,
                  crop_mode=True, save_results=True, test_compress=True)
```
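
Since the presets differ only in `base_size`, `image_size`, and `crop_mode`, they can be wrapped behind a small helper. A minimal sketch; `MODES` and `run_ocr` are illustrative names, not part of the DeepSeek-OCR API:

```python
# Wrap the documented resolution presets behind one helper.
# MODES and run_ocr are illustrative names, not model API.
MODES = {
    "tiny":   dict(base_size=512,  image_size=512,  crop_mode=False),
    "small":  dict(base_size=640,  image_size=640,  crop_mode=False),
    "base":   dict(base_size=1024, image_size=1024, crop_mode=False),
    "large":  dict(base_size=1280, image_size=1280, crop_mode=False),
    "gundam": dict(base_size=1024, image_size=640,  crop_mode=True),
}

def run_ocr(model, tokenizer, image_file, output_path, mode="gundam",
            prompt="<image>\n<|grounding|>Convert the document to markdown. "):
    """Call model.infer with one of the preset resolution modes."""
    return model.infer(tokenizer, prompt=prompt, image_file=image_file,
                       output_path=output_path, save_results=True,
                       **MODES[mode])

# e.g. res = run_ocr(model, tokenizer, 'your_image.jpg', 'your/output/dir', mode="base")
```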

## vLLM

Refer to [🌟 GitHub](https://github.com/deepseek-ai/DeepSeek-OCR/) for guidance on inference acceleration with vLLM, PDF processing, and more.
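
The repository's vLLM pipeline is the reference for PDFs; purely as an illustration, a per-page loop over the Transformers path above might look like this sketch, which assumes the third-party `pdf2image` package (and a Poppler install), neither of which the model card mentions:

```python
# Not the official pipeline; a per-page PDF loop using the
# Transformers inference path above. Assumes pdf2image + Poppler.
import os
from pdf2image import convert_from_path

def ocr_pdf(model, tokenizer, pdf_path, output_dir,
            prompt="<image>\n<|grounding|>Convert the document to markdown. "):
    pages = convert_from_path(pdf_path, dpi=200)  # one PIL image per page
    results = []
    for i, page in enumerate(pages):
        image_file = os.path.join(output_dir, f"page_{i:04d}.png")
        page.save(image_file)
        # Gundam preset, as in the usage example above
        results.append(model.infer(tokenizer, prompt=prompt,
                                   image_file=image_file,
                                   output_path=output_dir,
                                   base_size=1024, image_size=640,
                                   crop_mode=True, save_results=True))
    return results
```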

## Visualizations

<table>
<tr>
<td><img src="assets/show1.jpg" style="width: 500px"></td>
<td><img src="assets/show2.jpg" style="width: 500px"></td>
</tr>
<tr>
<td><img src="assets/show3.jpg" style="width: 500px"></td>
<td><img src="assets/show4.jpg" style="width: 500px"></td>
</tr>
</table>

## Acknowledgement

We would like to thank [Vary](https://github.com/Ucas-HaoranWei/Vary/), [GOT-OCR2.0](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/), [MinerU](https://github.com/opendatalab/MinerU), [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [OneChart](https://github.com/LingyvKong/OneChart), and [Slow Perception](https://github.com/Ucas-HaoranWei/Slow-Perception) for their valuable models and ideas.

We also appreciate the benchmarks [Fox](https://github.com/ucaslcl/Fox) and [OmniDocBench](https://github.com/opendatalab/OmniDocBench).

## Citation

Coming soon!