update app models(++)
app.py CHANGED
@@ -150,14 +150,15 @@ model_o = AutoModelForVision2Seq.from_pretrained(
     MODEL_ID_O, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
 ).to(device).eval()
 
-# ---
-
-
-
-
-
-
+# --- AIDC-AI/Ovis2-1B Model Loading ---
+MODEL_ID_O2 = 'AIDC-AI/Ovis2-1B'
+model_o2 = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_O2,
+    torch_dtype=torch.bfloat16,
+    multimodal_max_length=8192,
+    trust_remote_code=True
 ).to(device).eval()
+text_tokenizer_o2 = model_o2.get_text_tokenizer()
 
 
 # --- PDF Generation and Preview Utility Function ---
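Unlike the app's other models, which pair an AutoProcessor with a vision-to-sequence class, Ovis2-1B loads through AutoModelForCausalLM with trust_remote_code=True, and its tokenizers are reached through model methods. A minimal standalone sketch of the same loading pattern (the device fallback and the get_visual_tokenizer() call follow the Ovis2 model card and are assumptions, not part of this commit):

import torch
from transformers import AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"

# multimodal_max_length is an Ovis-specific kwarg consumed by the model's
# remote code; trust_remote_code=True is required because Ovis2 is not a
# native transformers architecture.
model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis2-1B",
    torch_dtype=torch.bfloat16,
    multimodal_max_length=8192,
    trust_remote_code=True,
).to(device).eval()

text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()  # assumption: per the Ovis2 model card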
@@ -250,6 +251,9 @@ def process_document_stream(
         yield "Please enter a prompt.", ""
         return
 
+    # --- Model-specific inference paths ---
+
+    # Moondream2 has a unique generation method
     if model_name == "Moondream2(vision)":
         image_embeds = moondream.encode_image(image)
         answer = moondream.answer_question(

@@ -259,8 +263,51 @@
         )
         yield answer, answer
         return
+
+    # Ovis2-1B has a custom preprocessing pipeline
+    elif model_name == "AIDC-AI/Ovis2-1B(ovis)":
+        conversations = [{"from": "human", "value": f"<image>\n{prompt_input}"}]
+
+        _, input_ids, pixel_values = model_o2.preprocess_inputs(conversations, [image], max_partition=16)
+        attention_mask = torch.ne(input_ids, text_tokenizer_o2.pad_token_id)
+
+        model_inputs = {
+            "inputs": input_ids.unsqueeze(0).to(device=model_o2.device),
+            "attention_mask": attention_mask.unsqueeze(0).to(device=model_o2.device),
+            "pixel_values": [pixel_values.to(dtype=torch.bfloat16, device=model_o2.device)]
+        }
+
+        streamer = TextIteratorStreamer(text_tokenizer_o2, skip_prompt=True, skip_special_tokens=True)
+
+        generation_kwargs = {
+            **model_inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+            "eos_token_id": model_o2.generation_config.eos_token_id,
+            "pad_token_id": text_tokenizer_o2.pad_token_id,
+            "use_cache": True
+        }
+
+        thread = Thread(target=model_o2.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer, buffer
+
+        yield buffer, buffer
+        return
 
-    #
+    # --- Standardized inference path for most other models ---
+
+    # Select model and processor
     if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
     elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
     elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
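The Ovis2 branch above uses the standard transformers streaming recipe: model.generate() blocks until generation finishes, so it runs on a worker thread while the foreground loop drains a TextIteratorStreamer and yields the growing buffer back to Gradio (the time.sleep(0.01) merely paces UI updates). The same pattern in isolation, with a small text-only model standing in for Ovis2 (model choice and prompt are illustrative):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() runs to completion on the worker thread; the streamer is the
# channel through which decoded text reaches the consuming loop below.
thread = Thread(target=lm.generate,
                kwargs={**inputs, "streamer": streamer, "max_new_tokens": 50})
thread.start()

buffer = ""
for new_text in streamer:  # blocks until the next decoded chunk arrives
    buffer += new_text
    print(buffer)          # a Gradio handler would `yield buffer` here instead
thread.join()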
@@ -274,7 +321,6 @@ def process_document_stream(
     elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g, model_g
     elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
     elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
-    elif model_name == "SmolVLM2-500M-Video-Instruct(video)": processor, model = processor_sv, model_sv
     else:
         yield "Invalid model selected.", ""
         return
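Model selection stays a flat elif chain, one line per dropdown label; the Ovis2 branch sidesteps it by returning earlier. A dict keyed by the label would express the same dispatch more compactly (a hypothetical refactor sketch, not what this commit does; the string pairs below stand in for the processor/model globals):

def select_model(model_name, registry):
    """Return the (processor, model) pair for a known label, else None."""
    return registry.get(model_name)

registry = {
    "LFM2-VL-450M(fast)": ("processor_m", "model_m"),  # placeholders, not real objects
    "OCRFlux-3B(ocr)": ("processor_v", "model_v"),
}
assert select_model("OCRFlux-3B(ocr)", registry) == ("processor_v", "model_v")
assert select_model("unknown", registry) is None  # caller yields "Invalid model selected."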
@@ -333,7 +379,7 @@ def create_gradio_interface():
         model_choice = gr.Dropdown(
             choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
                      "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
-                     "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)", "SmolVLM2-500M-Video-Instruct(video)"],
+                     "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)", "AIDC-AI/Ovis2-1B(ovis)"],
             label="Select Model", value= "LFM2-VL-450M(fast)"
         )
         prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")
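The dropdown value is a plain string that Gradio hands to the event handler, which is what lets process_document_stream dispatch on it. A self-contained sketch of that wiring (the stub handler and component set are illustrative, not the app's real layout):

import gradio as gr

def process(model_name, prompt):
    # The real handler streams partial results; this stub just echoes.
    return f"{model_name}: {prompt}"

with gr.Blocks() as demo:
    model_choice = gr.Dropdown(
        choices=["LFM2-VL-450M(fast)", "AIDC-AI/Ovis2-1B(ovis)"],
        label="Select Model", value="LFM2-VL-450M(fast)",
    )
    prompt_input = gr.Textbox(label="Query Input", value="Describe the image!")
    output = gr.Textbox(label="Output")
    gr.Button("Run").click(process, inputs=[model_choice, prompt_input], outputs=output)

if __name__ == "__main__":
    demo.launch()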