Spaces: Running on Zero

Commit: v1 demo

Changed files:
- app.py (+17 -65)
- videollama2/model/multimodal_projector/builder.py (+1 -1)

app.py CHANGED
@@ -19,7 +19,7 @@ from videollama2.mm_utils import KeywordsStoppingCriteria, tokenizer_MMODAL_token
 title_markdown = ("""
 <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
   <a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
-    <img src="https://s2.loli.net/2024/06/03/D3NeXHWy5az9tmT.png" alt="
+    <img src="https://s2.loli.net/2024/06/03/D3NeXHWy5az9tmT.png" alt="VideoLLaMA 2 🔥🚀🔥" style="max-width: 120px; height: auto;">
   </a>
   <div>
     <h1 >VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs</h1>
@@ -89,9 +89,8 @@ class Chat:
         # 2. text preprocess (tag process & generate prompt).
         state = self.get_prompt(prompt, state)
         prompt = state.get_prompt()
-
-
-        input_ids = tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[modals[0]], return_tensors='pt').unsqueeze(0).to(self.model.device)
+        input_ids = tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[modals[0]], return_tensors='pt')
+        input_ids = input_ids.unsqueeze(0).to(self.model.device)

         # 3. generate response according to visual signals and prompts.
         stop_str = self.conv.sep if self.conv.sep_style in [SeparatorStyle.SINGLE] else self.conv.sep2
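For readers unfamiliar with the helper: tokenizer_MMODAL_token follows the LLaVA-style pattern of splitting the prompt on a modal tag, tokenizing each text chunk, and splicing in a negative placeholder id that the model later replaces with visual embeddings. A minimal sketch of that pattern (the tag string and placeholder id below are illustrative, not VideoLLaMA2's exact constants):

import torch

def splice_modal_token(prompt, tokenizer, modal_token="<video>", modal_index=-201):
    # Tokenize the text on either side of the modal tag separately.
    chunks = [tokenizer(c, add_special_tokens=False).input_ids
              for c in prompt.split(modal_token)]
    input_ids = list(chunks[0])
    for chunk in chunks[1:]:
        input_ids.append(modal_index)   # placeholder for visual features
        input_ids.extend(chunk)
    return torch.tensor(input_ids, dtype=torch.long)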
@@ -116,19 +115,6 @@ class Chat:
         return outputs, state


-def save_image_to_local(image):
-    filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.jpg')
-    image = Image.open(image)
-    image.save(filename)
-    return filename
-
-
-def save_video_to_local(video_path):
-    filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.mp4')
-    shutil.copyfile(video_path, filename)
-    return filename
-
-
 @spaces.GPU(duration=120)
 def generate(image, video, first_run, state, state_, textbox_in, dtype=torch.float16):
     flag = 1
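The deleted helpers were already dead code (their call sites in the next hunk are commented out), and they also relied on tempfile._get_candidate_names(), a private CPython detail. If such helpers are ever reinstated, a sketch using only the public tempfile API:

import os
import shutil
import tempfile

def save_video_to_local(video_path: str) -> str:
    # mkstemp creates the file atomically, avoiding the name-guessing race;
    # assumes the 'temp' directory exists (app.py created it via os.makedirs("temp")).
    fd, filename = tempfile.mkstemp(suffix='.mp4', dir='temp')
    os.close(fd)
    shutil.copyfile(video_path, filename)
    return filename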
@@ -180,14 +166,10 @@ def generate(image, video, first_run, state, state_, textbox_in, dtype=torch.float16):
     text_en_out = text_en_out.split('#')[0]
     textbox_out = text_en_out

-    print(image, video)
-
     show_images = ""
     if os.path.exists(image):
-        # filename = save_image_to_local(image)
         show_images += f'<img src="./file={image}" style="display: inline-block;width: 250px;max-height: 400px;">'
     if os.path.exists(video):
-        # filename = save_video_to_local(video)
         show_images += f'<video controls playsinline width="500" style="display: inline-block;" src="./file={video}"></video>'

     if flag:
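The src="./file={...}" URLs lean on Gradio's built-in file route, which serves local paths back to the browser, provided the path sits under a directory the app is allowed to serve. A hedged usage sketch (the directory name is illustrative):

import gradio as gr

with gr.Blocks() as demo:
    # HTML rendered in a component can reference ./file=<path> URLs.
    gr.HTML('<img src="./file=temp/example.jpg">')

# allowed_paths must cover any directory referenced via ./file=
demo.launch(allowed_paths=["temp"])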
@@ -215,58 +197,30 @@ def clear_history(state, state_):
             state.to_gradio_chatbot(), \
             True, state, state_, gr.update(value=None, interactive=True))

+# BUG of Zero Environment
+# 1. The environment is fixed to torch==2.0.1+cu117, gradio>=4.x.x
+# 2. The operation or tensor which requires cuda are limited in those functions wrapped via spaces.GPU
+# 3. The function can't return tensor or other cuda objects.

 conv_mode = "llama_2"
 model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'

-def find_cuda():
-    # Check if CUDA_HOME or CUDA_PATH environment variables are set
-    cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
-
-    if cuda_home and os.path.exists(cuda_home):
-        return cuda_home
-
-    # Search for the nvcc executable in the system's PATH
-    nvcc_path = shutil.which('nvcc')
-
-    if nvcc_path:
-        # Remove the 'bin/nvcc' part to get the CUDA installation path
-        cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
-        return cuda_path
-
-    return None
-
-cuda_path = find_cuda()
-
-if cuda_path:
-    print(f"CUDA installation found at: {cuda_path}")
-else:
-    print("CUDA installation not found")
-
 device = torch.device("cuda")

 handler = Chat(model_path, conv_mode=conv_mode, load_8bit=False, load_4bit=True)
-# handler.model.to(dtype=torch.float16)
-# handler = handler.model.to(device)
-
-os.makedirs("temp")
-
-textbox = gr.Textbox(
-    show_label=False, placeholder="Enter text and press ENTER", container=False
-)
-with gr.Blocks(title='VideoLLaMA2🔥🚀🔥', theme=gr.themes.Default(), css=block_css) as demo:
+textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
+
+with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=gr.themes.Default(), css=block_css) as demo:
     gr.Markdown(title_markdown)
     state = gr.State()
     state_ = gr.State()
     first_run = gr.State()
-    # tensor = gr.State()
-    # modals = gr.State()

     with gr.Row():
         with gr.Column(scale=3):
             image = gr.Image(label="Input Image", type="filepath")
-            video = gr.Video(label="Input Video")
+            video = gr.Video(label="Input Video", type="filepath")

             cur_dir = os.path.dirname(os.path.abspath(__file__))
             gr.Examples(
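The three numbered comments summarize the ZeroGPU contract that shapes this file: the model is loaded once (4-bit quantized) at import time, every CUDA-touching operation lives inside the spaces.GPU-wrapped generate(), and wrapped functions must hand back CPU objects. A minimal sketch of that contract (function name and values are illustrative):

import spaces
import torch

@spaces.GPU(duration=120)          # a GPU is attached only for this call, up to 120 s
def gpu_double(values):
    x = torch.tensor(values, device="cuda")   # CUDA work is fine inside the wrapper
    return (x * 2).cpu().tolist()             # return plain Python, never a CUDA tensor

print(gpu_double([1.0, 2.0, 3.0]))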
@@ -288,19 +242,19 @@ with gr.Blocks(title='VideoLLaMA2🔥🚀🔥', theme=gr.themes.Default(), css=block_css) as demo:
             )

         with gr.Column(scale=7):
-            chatbot = gr.Chatbot(label="
+            chatbot = gr.Chatbot(label="VideoLLaMA 2", bubble_full_width=True, height=750)
             with gr.Row():
                 with gr.Column(scale=8):
                     textbox.render()
                 with gr.Column(scale=1, min_width=50):
                     submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
             with gr.Row(elem_id="buttons") as button_row:
-                upvote_btn
-                downvote_btn
-                # flag_btn
-                # stop_btn
+                upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
+                downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
+                # flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
+                # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
                 regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
-                clear_btn
+                clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)

     gr.Markdown(tos_markdown)
     gr.Markdown(learn_more_markdown)
@@ -308,9 +262,7 @@ with gr.Blocks(title='VideoLLaMA2🔥🚀🔥', theme=gr.themes.Default(), css=block_css) as demo:
     submit_btn.click(
         generate,
         [image, video, first_run, state, state_, textbox],
-        [image, video, chatbot, first_run, state, state_, textbox
-         # tensor, modals
-        ])
+        [image, video, chatbot, first_run, state, state_, textbox])

     regenerate_btn.click(
         regenerate,
videollama2/model/multimodal_projector/builder.py CHANGED

@@ -20,7 +20,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from timm.models.regnet import RegStage
-from timm.models.layers import
+from timm.models.layers import LayerNorm2d
 from transformers import TRANSFORMERS_CACHE
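For context on the restored import: the projector's convolutional stages (RegStage, imported just above) operate on NCHW feature maps, and LayerNorm2d is timm's channels-first LayerNorm that normalizes such maps between blocks. A hedged sketch of the combination (channel count and depth are illustrative, not the builder's exact configuration):

import torch
import torch.nn as nn
from timm.models.layers import LayerNorm2d
from timm.models.regnet import RegStage

# A toy spatial stage: RegNet blocks with channels-first LayerNorm and SiLU.
stage = RegStage(depth=2, in_chs=1024, out_chs=1024, stride=1, dilation=1,
                 act_layer=nn.SiLU, norm_layer=LayerNorm2d)

feat = torch.randn(1, 1024, 24, 24)   # (N, C, H, W) vision-tower feature map
print(stage(feat).shape)              # torch.Size([1, 1024, 24, 24])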