Spaces:

vk888
/

traffic-vqa-llm

Running

App Files Files Community

vk committed on Apr 9

Commit

43917f9

1 Parent(s): 5b777f6

first commit

Browse files

Files changed (6) hide show

.gitattributes +1 -0
.gitignore +2 -0
app.py +193 -0
download.py +5 -0
paligemma_tokenizer.model +3 -0
requirements.txt +9 -0

.gitattributes CHANGED Viewed

@@ -7,6 +7,7 @@
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text

 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ saved_model
2	+ big_vision_repo

app.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import os
+os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "1.0"
+if not os.path.exists("big_vision_repo"):
+    print('downloading big_vision_repo')
+    os.system("git clone --quiet --branch=main --depth=1 \
+    https://github.com/google-research/big_vision big_vision_repo")
+import sys
+if "big_vision_repo" not in sys.path:
+  sys.path.append("big_vision_repo")
+import functools
+import jax
+import numpy as np
+import ml_collections
+import tensorflow as tf
+import sentencepiece
+# Import model definition from big_vision
+from big_vision.models.proj.paligemma import paligemma
+from big_vision.trainers.proj.paligemma import predict_fns
+# Import big vision utilities
+import big_vision.datasets.jsonl
+import big_vision.utils
+import big_vision.sharding
+from glob import glob
+import cv2
+from time import time
+import gradio as gr
+def preprocess_image(image, size=224):
+  # Model has been trained to handle images of different aspects ratios
+  # resized to 224x224 in the range [-1, 1]. Bilinear and antialias resize
+  # options are helpful to improve quality in some tasks.
+  image = np.asarray(image)
+  if image.ndim == 2:  # Convert image without last channel into greyscale.
+    image = np.stack((image,)*3, axis=-1)
+  image = image[..., :3]  # Remove alpha layer.
+  assert image.shape[-1] == 3
+  image = tf.constant(image)
+  image = tf.image.resize(image, (size, size), method='bilinear', antialias=True)
+  return image.numpy() / 127.5 - 1.0  # [0, 255]->[-1,1]
+def preprocess_tokens(prefix, suffix=None, seqlen=None):
+  # Model has been trained to handle tokenized text composed of a prefix with
+  # full attention and a suffix with causal attention.
+  separator = "\n"
+  tokens = tokenizer.encode(prefix, add_bos=True) + tokenizer.encode(separator)
+  mask_ar = [0] * len(tokens)    # 0 to use full attention for prefix.
+  mask_loss = [0] * len(tokens)  # 0 to not use prefix tokens in the loss.
+  if suffix:
+    suffix = tokenizer.encode(suffix, add_eos=True)
+    tokens += suffix
+    mask_ar += [1] * len(suffix)    # 1 to use causal attention for suffix.
+    mask_loss += [1] * len(suffix)  # 1 to use suffix tokens in the loss.
+  mask_input = [1] * len(tokens)    # 1 if it's a token, 0 if padding.
+  if seqlen:
+    padding = [0] * max(0, seqlen - len(tokens))
+    tokens = tokens[:seqlen] + padding
+    mask_ar = mask_ar[:seqlen] + padding
+    mask_loss = mask_loss[:seqlen] + padding
+    mask_input = mask_input[:seqlen] + padding
+  return jax.tree.map(np.array, (tokens, mask_ar, mask_loss, mask_input))
+def postprocess_tokens(tokens):
+  tokens = tokens.tolist()  # np.array to list[int]
+  try:  # Remove tokens at and after EOS if any.
+    eos_pos = tokens.index(tokenizer.eos_id())
+    tokens = tokens[:eos_pos]
+  except ValueError:
+    pass
+  return tokenizer.decode(tokens)
+def get_response(image,prefix):
+    if len(prefix)<1:
+        prefix="caption en"
+    print('caption:',prefix)
+    image = preprocess_image(image)
+    examples = []
+    tokens, mask_ar, _, mask_input = preprocess_tokens(prefix, seqlen=SEQLEN)
+    examples.append({
+        "image": np.asarray(image),
+        "text": np.asarray(tokens),
+        "mask_ar": np.asarray(mask_ar),
+        "mask_input": np.asarray(mask_input),
+    })
+    examples[-1]["_mask"] = np.array(True)
+    batch = jax.tree.map(lambda *x: np.stack(x), *examples)
+    batch = big_vision.utils.reshard(batch, data_sharding)
+    # print('gonna predict')
+    start = time()
+    # Make model predictions
+    tokens = decode({"params": params}, batch=batch,
+                    max_decode_len=SEQLEN, sampler="greedy")
+    # print('predict done')
+    # Fetch model predictions to device and detokenize.
+    tokens, mask = jax.device_get((tokens, batch["_mask"]))
+    tokens = tokens[mask]  # remove padding examples.
+    responses = [postprocess_tokens(t) for t in tokens]
+    end = time()
+    print(responses)
+    print('\n')
+    print('Time elpased ', end - start)
+    return responses[0]
+def download_model():
+    print('downloading model')
+    os.system('gdown 1-HyAeenHhS0xu2m9-fvsZw5sGhytsf7s')
+def show_example(path):
+    return cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
+if __name__ == "__main__":
+    iface = gr.Interface(
+        cache_examples=False,
+        fn=get_response,
+        inputs=[gr.Image(type="numpy"), gr.Textbox(placeholder="caption en")],
+        examples=[[show_example('test-images/b20d494a-cdebe83e.jpg')],[show_example('test-images/b43eb946-b8bc931c.jpg')],[show_example('test-images/b7d13f97-74ae37ed.jpg')],[show_example('test-images/bce15cb0-2d6aec27.jpg')],[show_example('test-images/b5e6efc0-345b365d.jpg')]],
+        outputs=[gr.Textbox(label="Response")],
+        title="Traffic Understanding with Multi-modal LLM",
+        description="Traffic Understanding with Multi-modal LLM")
+    SEQLEN = 128
+    # Don't let TF use the GPU or TPUs
+    tf.config.set_visible_devices([], "GPU")
+    tf.config.set_visible_devices([], "TPU")
+    TOKENIZER_PATH = "paligemma_tokenizer.model"
+    MODEL_PATH = './traffic-vqa_ckpt_person1.npz'
+    # if not (os.path.exists(MODEL_PATH)):
+    #     download_model()
+    backend = jax.lib.xla_bridge.get_backend()
+    print(f"JAX version:  {jax.__version__}")
+    print(f"JAX platform: {backend.platform}")
+    print(f"JAX devices:  {jax.device_count()}")
+    LLM_VARIANT = "gemma2_2b"
+    model_config = ml_collections.FrozenConfigDict({
+        "llm": {"vocab_size": 257_152, "variant": LLM_VARIANT, "final_logits_softcap": 0.0},
+        "img": {"variant": "So400m/14", "pool_type": "none", "scan": True, "dtype_mm": "float16"}
+    })
+    model = paligemma.Model(**model_config)
+    tokenizer = sentencepiece.SentencePieceProcessor(TOKENIZER_PATH)
+    # Load params - this can take up to 1 minute in T4 colabs.
+    params = paligemma.load(None, MODEL_PATH, model_config)
+    # Define `decode` function to sample outputs from the model.
+    decode_fn = predict_fns.get_all(model)['decode']
+    decode = functools.partial(decode_fn, devices=jax.devices(), eos_token=tokenizer.eos_id())
+    mesh = jax.sharding.Mesh(jax.devices(), ("data"))
+    data_sharding = jax.sharding.NamedSharding(
+        mesh, jax.sharding.PartitionSpec("data"))
+    iface.launch()

download.py ADDED Viewed

	@@ -0,0 +1,5 @@


1	+ import os
2	+
3	+
4	+
5	+ wget --no-check-certificate "https://drive.google.com/uc?export=download&id=1-HyAeenHhS0xu2m9-fvsZw5sGhytsf7s" -O your_desired_filename.ext

paligemma_tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8986bb4f423f07f8c7f70d0dbe3526fb2316056c17bae71b1ea975e77a168fc6
+size 4264023

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+gdown
+jax
+flax
+overrides
+ml_collections
+einops~=0.7
+sentencepiece
+tensorflow
+opencv_python