---
language:
- en
library_name: transformers
license: apache-2.0
metrics:
- accuracy
tags:
- multimodal
pipeline_tag: video-text-to-text
base_model: Qwen/Qwen2.5-VL-7B-Instruct
---
# VideoChat-R1_5-7B
[\[GitHub\]](https://github.com/OpenGVLab/VideoChat-R1)
[\[Tech Report\]](https://arxiv.org/pdf/2509.21100v1)
## How to use the model
We provide a simple installation example below:
```
pip install transformers
```
Use the `qwen_vl_utils` helper provided at https://github.com/OpenGVLab/VideoChat-R1/blob/main/Videochat-R1.5/src_eval/my_vision_process.py.
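If you prefer to script the setup, a minimal sketch for fetching that helper is shown below; the raw-file URL and the choice to save it as `qwen_vl_utils.py` (so the import in the snippet further down resolves to the repo's version, which supports the `key_time` and `client` arguments) are assumptions, not part of the official instructions.

```python
# Minimal sketch (assumption): download the repo's vision-processing helper and
# save it as qwen_vl_utils.py so `from qwen_vl_utils import process_vision_info`
# picks up this local copy. The raw URL is derived from the GitHub link above.
import urllib.request

RAW_URL = (
    "https://raw.githubusercontent.com/OpenGVLab/VideoChat-R1/main/"
    "Videochat-R1.5/src_eval/my_vision_process.py"
)
urllib.request.urlretrieve(RAW_URL, "qwen_vl_utils.py")
```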
Then you can use our model:
```python
import ast
import re

import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

model_path = "OpenGVLab/VideoChat-R1_5"

# default: load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",  # requires flash-attn; remove to fall back to the default attention
)

# default processor
processor = AutoProcessor.from_pretrained(model_path)

video_path = "your_video.mp4"
question = "your_question"
num_perceptions = 3  # number of iterative perception rounds

QA_THINK_GLUE = """Answer the question: "[QUESTION]" according to the content of the video.
Output your think process within the <think> </think> tags.
Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. At the same time, in the <glue> </glue> tags, present the precise time period in seconds of the video clips on which you base your answer to this question in the format of [(s1, e1), (s2, e2), ...]. For example: <think>...</think><answer>A</answer><glue>[(5.2, 10.4)]</glue>.
"""

QA_THINK = """Answer the question: "[QUESTION]" according to the content of the video.
Output your think process within the <think> </think> tags.
Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. For example: <think>...</think><answer>A</answer>.
"""


def inference(video_path, prompt, model, processor, max_new_tokens=2048, device="cuda:0", client=None, pred_glue=None):
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": video_path,
                    "key_time": pred_glue,  # time spans predicted in the previous round guide the video sampling
                    "total_pixels": 128 * 12 * 28 * 28,
                    "min_pixels": 128 * 28 * 28,
                },
                {"type": "text", "text": prompt},
            ],
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True, client=client)
    fps_inputs = video_kwargs["fps"]
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, fps=fps_inputs, padding=True, return_tensors="pt")
    inputs = inputs.to(device)

    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
    generated_ids = [output_ids[i][len(inputs.input_ids[i]):] for i in range(len(output_ids))]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]


device = "cuda:0"
client = None  # optional storage client passed through to process_vision_info
answers = []
pred_glue = None

# Iterative perception: every round except the last also asks for <glue> time spans,
# which are fed back as key_time to refine the video sampling of the next round.
for perception in range(num_perceptions):
    if perception == num_perceptions - 1:
        example_prompt = QA_THINK.replace("[QUESTION]", question)
    else:
        example_prompt = QA_THINK_GLUE.replace("[QUESTION]", question)
    ans = inference(video_path, example_prompt, model, processor, device=device, client=client, pred_glue=pred_glue)
    answers.append(ans)

    # Parse the predicted time spans for the next round.
    match_glue = re.search(r"<glue>(.*?)</glue>", ans, re.DOTALL)
    pred_glue = None
    try:
        if match_glue:
            pred_glue = ast.literal_eval(match_glue.group(1))
    except Exception:
        pred_glue = None

print(ans)
```
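Each round's output keeps the `<think>`/`<answer>` format, so the predicted option letter can be recovered with a small regex. A minimal sketch (the `parse_answer` helper below is illustrative, not part of the repo):

```python
import re

def parse_answer(response: str):
    """Return the text inside <answer>...</answer>, or None if absent."""
    match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
    return match.group(1).strip() if match else None

# Applied to the last round's output `ans` from the loop above:
print(parse_answer("<think>...</think><answer>A</answer>"))  # -> "A"
```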
## Citation
If you find this project useful in your research, please consider citing:
```BibTeX
@article{li2025videochatr1,
title={VideoChat-R1: Enhancing Spatio-Temporal Perception via Reinforcement Fine-Tuning},
author={Li, Xinhao and Yan, Ziang and Meng, Desen and Dong, Lu and Zeng, Xiangyu and He, Yinan and Wang, Yali and Qiao, Yu and Wang, Yi and Wang, Limin},
journal={arXiv preprint arXiv:2504.06958},
year={2025}
}
@article{yan2025videochatr15,
title={VideoChat-R1.5: Visual Test-Time Scaling to Reinforce Multimodal Reasoning by Iterative Perception},
author={Yan, Ziang and Li, Xinhao and He, Yinan and Yue, Zhengrong and Zeng, Xiangyu and Wang, Yali and Qiao, Yu and Wang, Limin and Wang, Yi},
journal={arXiv preprint arXiv:2509.21100},
year={2025}
}
```