Spaces:
Sleeping
Sleeping
| import editdistance | |
| from datasets import load_dataset | |
| from tqdm import tqdm | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from ..hf import detect_device | |
# Hugging Face Hub repository that provides both the tokenizer and the model.
MODEL_ID = "vikhyatk/moondream2"

# Target device and tensor dtype, as chosen by the project-local helper.
DEVICE, DTYPE = detect_device()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Keyword arguments for loading the model, collected in one place.
# NOTE(review): "flash_attention_2" is hard-coded regardless of DEVICE; it
# presumably requires a CUDA device with flash-attn installed -- confirm that
# detect_device() cannot return a device where this fails.
_model_kwargs = {
    "trust_remote_code": True,
    "attn_implementation": "flash_attention_2",
    "torch_dtype": DTYPE,
    # The empty-string key maps the whole model onto a single device.
    "device_map": {"": DEVICE},
}
moondream = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_model_kwargs)

# Switch to inference mode before running the benchmark.
moondream.eval()
def get_anls(s1: str, s2: str) -> float:
    """Return the thresholded normalized edit similarity of two strings.

    Both strings are lower-cased and stripped of surrounding whitespace.
    The similarity is ``1 - edit_distance / max(len(s1), len(s2))``; values
    below 0.5 are clamped to 0.0.

    Args:
        s1: First string (e.g. the model's answer).
        s2: Second string (e.g. a ground-truth answer).

    Returns:
        A float in ``[0.5, 1.0]`` when the strings are similar enough,
        otherwise ``0.0``. Identical strings, including two empty strings,
        score ``1.0``.
    """
    s1 = s1.lower().strip()
    s2 = s2.lower().strip()

    # Identical strings are a perfect match. Returning early also covers the
    # case where both are empty, which would otherwise divide by zero below.
    if s1 == s2:
        return 1.0

    # The strings differ here, so at least one is non-empty and the
    # denominator is >= 1.
    similarity = 1 - editdistance.eval(s1, s2) / max(len(s1), len(s2))
    return similarity if similarity >= 0.5 else 0.0
# Validation split of the DocVQA dataset used for this benchmark.
docvqa_val = load_dataset("vikhyatk/docvqa", split="validation")

# One best-match ANLS score per question, accumulated across all rows.
scores = []

for row in tqdm(docvqa_val):
    # Encode the image once and reuse the encoding for each of its questions.
    enc_image = moondream.encode_image(row["image"])

    for qa in row["qa"]:
        question, answers = qa["question"], qa["answers"]
        prompt = f"{question}\nAnswer briefly with a single word or phrase."
        model_answer = moondream.answer_question(enc_image, prompt, tokenizer)

        # Score against every ground-truth answer and keep the best match.
        per_answer_scores = [get_anls(model_answer, gt) for gt in answers]
        scores.append(max(per_answer_scores))

# Report the mean score over all questions.
mean_anls = sum(scores) / len(scores)
print("ANLS:", mean_anls)