Spaces: Running on Zero

Commit: v1 demo

Changed files:
- app.py (+17 -65)
- videollama2/model/multimodal_projector/builder.py (+1 -1)

app.py CHANGED
@@ -19,7 +19,7 @@ from videollama2.mm_utils import KeywordsStoppingCriteria, tokenizer_MMODAL_token
 title_markdown = ("""
 <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
   <a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
-    <img src="https://s2.loli.net/2024/06/03/D3NeXHWy5az9tmT.png" alt="
+    <img src="https://s2.loli.net/2024/06/03/D3NeXHWy5az9tmT.png" alt="VideoLLaMA 2 🔥🚀🔥" style="max-width: 120px; height: auto;">
   </a>
   <div>
     <h1 >VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs</h1>
@@ -89,9 +89,8 @@ class Chat:
         # 2. text preprocess (tag process & generate prompt).
         state = self.get_prompt(prompt, state)
         prompt = state.get_prompt()
-
-
-        input_ids = tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[modals[0]], return_tensors='pt').unsqueeze(0).to(self.model.device)
+        input_ids = tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[modals[0]], return_tensors='pt')
+        input_ids = input_ids.unsqueeze(0).to(self.model.device)

         # 3. generate response according to visual signals and prompts.
         stop_str = self.conv.sep if self.conv.sep_style in [SeparatorStyle.SINGLE] else self.conv.sep2
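For readers unfamiliar with the helper: tokenizer_MMODAL_token follows the LLaVA-style pattern of splitting the prompt on a modal tag, tokenizing each text chunk, and splicing in a negative placeholder id that the model later replaces with visual embeddings. A minimal sketch of that pattern (the tag string and placeholder id below are illustrative, not VideoLLaMA2's exact constants):

import torch

def splice_modal_token(prompt, tokenizer, modal_token="<video>", modal_index=-201):
    # Tokenize the text on either side of the modal tag separately.
    chunks = [tokenizer(c, add_special_tokens=False).input_ids
              for c in prompt.split(modal_token)]
    input_ids = list(chunks[0])
    for chunk in chunks[1:]:
        input_ids.append(modal_index)   # placeholder for visual features
        input_ids.extend(chunk)
    return torch.tensor(input_ids, dtype=torch.long)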
@@ -116,19 +115,6 @@ class Chat:
         return outputs, state


-def save_image_to_local(image):
-    filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.jpg')
-    image = Image.open(image)
-    image.save(filename)
-    return filename
-
-
-def save_video_to_local(video_path):
-    filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.mp4')
-    shutil.copyfile(video_path, filename)
-    return filename
-
-
 @spaces.GPU(duration=120)
 def generate(image, video, first_run, state, state_, textbox_in, dtype=torch.float16):
     flag = 1
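The deleted helpers were already dead code (their call sites in the next hunk are commented out), and they also relied on tempfile._get_candidate_names(), a private CPython detail. If such helpers are ever reinstated, a sketch using only the public tempfile API:

import os
import shutil
import tempfile

def save_video_to_local(video_path: str) -> str:
    # mkstemp creates the file atomically, avoiding the name-guessing race;
    # assumes the 'temp' directory exists (app.py created it via os.makedirs("temp")).
    fd, filename = tempfile.mkstemp(suffix='.mp4', dir='temp')
    os.close(fd)
    shutil.copyfile(video_path, filename)
    return filename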
@@ -180,14 +166,10 @@ def generate(image, video, first_run, state, state_, textbox_in, dtype=torch.float16):
     text_en_out = text_en_out.split('#')[0]
     textbox_out = text_en_out

-    print(image, video)
-
     show_images = ""
     if os.path.exists(image):
-        # filename = save_image_to_local(image)
         show_images += f'<img src="./file={image}" style="display: inline-block;width: 250px;max-height: 400px;">'
     if os.path.exists(video):
-        # filename = save_video_to_local(video)
         show_images += f'<video controls playsinline width="500" style="display: inline-block;" src="./file={video}"></video>'

     if flag:
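The src="./file={...}" URLs lean on Gradio's built-in file route, which serves local paths back to the browser, provided the path sits under a directory the app is allowed to serve. A hedged usage sketch (the directory name is illustrative):

import gradio as gr

with gr.Blocks() as demo:
    # HTML rendered in a component can reference ./file=<path> URLs.
    gr.HTML('<img src="./file=temp/example.jpg">')

# allowed_paths must cover any directory referenced via ./file=
demo.launch(allowed_paths=["temp"])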
@@ -215,58 +197,30 @@ def clear_history(state, state_):
             state.to_gradio_chatbot(), \
             True, state, state_, gr.update(value=None, interactive=True))

+# BUG of Zero Environment
+# 1. The environment is fixed to torch==2.0.1+cu117, gradio>=4.x.x
+# 2. The operation or tensor which requires cuda are limited in those functions wrapped via spaces.GPU
+# 3. The function can't return tensor or other cuda objects.

 conv_mode = "llama_2"
 model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'

-def find_cuda():
-    # Check if CUDA_HOME or CUDA_PATH environment variables are set
-    cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
-
-    if cuda_home and os.path.exists(cuda_home):
-        return cuda_home
-
-    # Search for the nvcc executable in the system's PATH
-    nvcc_path = shutil.which('nvcc')
-
-    if nvcc_path:
-        # Remove the 'bin/nvcc' part to get the CUDA installation path
-        cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
-        return cuda_path
-
-    return None
-
-cuda_path = find_cuda()
-
-if cuda_path:
-    print(f"CUDA installation found at: {cuda_path}")
-else:
-    print("CUDA installation not found")
-
 device = torch.device("cuda")

 handler = Chat(model_path, conv_mode=conv_mode, load_8bit=False, load_4bit=True)
-# handler.model.to(dtype=torch.float16)
-# handler = handler.model.to(device)
-
-os.makedirs("temp")
-
-textbox = gr.Textbox(
-    show_label=False, placeholder="Enter text and press ENTER", container=False
-)
-with gr.Blocks(title='VideoLLaMA2🔥🚀🔥', theme=gr.themes.Default(), css=block_css) as demo:
+textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
+
+with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=gr.themes.Default(), css=block_css) as demo:
     gr.Markdown(title_markdown)
     state = gr.State()
     state_ = gr.State()
     first_run = gr.State()
-    # tensor = gr.State()
-    # modals = gr.State()

     with gr.Row():
         with gr.Column(scale=3):
             image = gr.Image(label="Input Image", type="filepath")
-            video = gr.Video(label="Input Video")
+            video = gr.Video(label="Input Video", type="filepath")

             cur_dir = os.path.dirname(os.path.abspath(__file__))
             gr.Examples(
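The three numbered comments summarize the ZeroGPU contract that shapes this file: the model is loaded once (4-bit quantized) at import time, every CUDA-touching operation lives inside the spaces.GPU-wrapped generate(), and wrapped functions must hand back CPU objects. A minimal sketch of that contract (function name and values are illustrative):

import spaces
import torch

@spaces.GPU(duration=120)          # a GPU is attached only for this call, up to 120 s
def gpu_double(values):
    x = torch.tensor(values, device="cuda")   # CUDA work is fine inside the wrapper
    return (x * 2).cpu().tolist()             # return plain Python, never a CUDA tensor

print(gpu_double([1.0, 2.0, 3.0]))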
@@ -288,19 +242,19 @@ with gr.Blocks(title='VideoLLaMA2🔥🚀🔥', theme=gr.themes.Default(), css=block_css) as demo:
             )

         with gr.Column(scale=7):
-            chatbot = gr.Chatbot(label="
+            chatbot = gr.Chatbot(label="VideoLLaMA 2", bubble_full_width=True, height=750)
             with gr.Row():
                 with gr.Column(scale=8):
                     textbox.render()
                 with gr.Column(scale=1, min_width=50):
                     submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
             with gr.Row(elem_id="buttons") as button_row:
-                upvote_btn
-                downvote_btn
-                # flag_btn
-                # stop_btn
+                upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
+                downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
+                # flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
+                # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
                 regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
-                clear_btn
+                clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)

     gr.Markdown(tos_markdown)
     gr.Markdown(learn_more_markdown)
@@ -308,9 +262,7 @@ with gr.Blocks(title='VideoLLaMA2🔥🚀🔥', theme=gr.themes.Default(), css=block_css) as demo:
     submit_btn.click(
         generate,
         [image, video, first_run, state, state_, textbox],
-        [image, video, chatbot, first_run, state, state_, textbox
-         # tensor, modals
-        ])
+        [image, video, chatbot, first_run, state, state_, textbox])

     regenerate_btn.click(
         regenerate,
videollama2/model/multimodal_projector/builder.py CHANGED

@@ -20,7 +20,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from timm.models.regnet import RegStage
-from timm.models.layers import
+from timm.models.layers import LayerNorm2d
 from transformers import TRANSFORMERS_CACHE
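For context on the restored import: the projector's convolutional stages (RegStage, imported just above) operate on NCHW feature maps, and LayerNorm2d is timm's channels-first LayerNorm that normalizes such maps between blocks. A hedged sketch of the combination (channel count and depth are illustrative, not the builder's exact configuration):

import torch
import torch.nn as nn
from timm.models.layers import LayerNorm2d
from timm.models.regnet import RegStage

# A toy spatial stage: RegNet blocks with channels-first LayerNorm and SiLU.
stage = RegStage(depth=2, in_chs=1024, out_chs=1024, stride=1, dilation=1,
                 act_layer=nn.SiLU, norm_layer=LayerNorm2d)

feat = torch.randn(1, 1024, 24, 24)   # (N, C, H, W) vision-tower feature map
print(stage(feat).shape)              # torch.Size([1, 1024, 24, 24])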