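# Gradio demo: grounding CLIP predictions with the Generic Attention-model
# Explainability method of Chefer et al. (2021), plus a spaCy-based NER tab.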
import sys
import gradio as gr
# sys.path.append("../")
sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")

import torch
import CLIP.clip as clip
import spacy
from PIL import Image, ImageFont, ImageDraw, ImageOps

import os
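# Download the spaCy English model at startup; the Space image is assumed not
# to ship with en_core_web_sm preinstalled.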
os.system('python -m spacy download en_core_web_sm')
from clip_grounding.utils.image import pad_to_square
from clip_grounding.datasets.png import (
    overlay_relevance_map_on_image,
)
from CLIP_explainability.utils import interpret, show_img_heatmap, show_heatmap_on_text
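# Restrict CLIP's model registry to the ViT-B checkpoints used here, so that
# clip.load cannot pull any other weights.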
clip.clip._MODELS = {
    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
}

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
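# Importing the model package directly (rather than calling spacy.load)
# presumably avoids a failed registry lookup, since the model was installed
# only after the interpreter started.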
# nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
nlp = en_core_web_sm.load()

# Gradio Section:
def run_demo(image, text):
    # Pad the image to a square so it matches CLIP's square input resolution.
    orig_image = pad_to_square(image)
    img = preprocess(orig_image).unsqueeze(0).to(device)
    text_input = clip.tokenize([text]).to(device)

    # Gradient-weighted attention rollout over both modalities.
    R_text, R_image = interpret(model=model, image=img, texts=text_input, device=device)

    # Image side: build the relevance heatmap and overlay it on the input image.
    image_relevance = show_img_heatmap(R_image[0], img, orig_image=orig_image, device=device, show=False)
    overlapped = overlay_relevance_map_on_image(image, image_relevance)

    # Text side: one relevance score per decoded token, for the highlight output.
    text_scores, text_tokens_decoded = show_heatmap_on_text(text, text_input, R_text[0], show=False)
    highlighted_text = [(str(token), float(text_scores[i]))
                        for i, token in enumerate(text_tokens_decoded)]

    return overlapped, highlighted_text
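# A minimal usage sketch outside Gradio, using one of the bundled example
# images (the output filename is hypothetical):
#   overlay, tokens = run_demo(Image.open("example_images/London.png"), "Big Ben")
#   overlay.save("big_ben_overlay.png")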

# Default demo:
input_img = gr.Image(type='pil', label="Original Image")
input_txt = gr.Textbox(label="Text")
inputs = [input_img, input_txt]

outputs = [gr.Image(type='pil', label="Output Image"),
           gr.HighlightedText(label="Text relevance")]
description = """A demonstration based on the Generic Attention-model Explainability method for Interpreting Bi-Modal
Transformers by Chefer et al. (2021): https://github.com/hila-chefer/Transformer-MM-Explainability.
<br> <br>
This demo shows attribution scores on both the image and the text input when presenting CLIP with a
<text, image> pair. Attributions are computed as Gradient-weighted Attention Rollout (Chefer et al.,
2021) and can be thought of as an estimate of the effective attention CLIP pays to its input when
computing a multimodal representation. <span style="color:red">Warning:</span> Note that attribution
methods such as the one in this demo can only give an estimate of the real underlying behavior
of the model."""
iface = gr.Interface(fn=run_demo,
                     inputs=inputs,
                     outputs=outputs,
                     title="CLIP Grounding Explainability",
                     description=description,
                     examples=[["example_images/London.png", "London Eye"],
                               ["example_images/London.png", "Big Ben"],
                               ["example_images/harrypotter.png", "Harry"],
                               ["example_images/harrypotter.png", "Hermione"],
                               ["example_images/harrypotter.png", "Ron"],
                               ["example_images/Amsterdam.png", "Amsterdam canal"],
                               ["example_images/Amsterdam.png", "Old buildings"],
                               ["example_images/Amsterdam.png", "Pink flowers"],
                               ["example_images/dogs_on_bed.png", "Two dogs"],
                               ["example_images/dogs_on_bed.png", "Book"],
                               ["example_images/dogs_on_bed.png", "Cat"]])

# NER demo:
def add_label_to_img(img, label, add_entity_label=True):
    # Add a white border so the label does not overlap the heatmap.
    img = ImageOps.expand(img, border=45, fill=(255, 255, 255))
    draw = ImageDraw.Draw(img)
    # arial.ttf is not available on most Linux hosts (such as Spaces);
    # fall back to PIL's built-in font if it cannot be loaded.
    try:
        font = ImageFont.truetype("arial.ttf", 24)
    except OSError:
        font = ImageFont.load_default()

    if add_entity_label:
        draw.text((5, 5), f"Entity: {str(label)}", align="center", fill=(0, 0, 0), font=font)
    else:
        draw.text((5, 5), str(label), align="center", fill=(0, 0, 0), font=font)
    return img
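# Usage sketch: add_label_to_img(overlay, "Big Ben (FAC)") returns the overlay
# with a white margin and "Entity: Big Ben (FAC)" drawn in the top-left corner
# ("FAC" is spaCy's label for buildings and landmarks).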
def NER_demo(image, text):
    # Apply NER to extract named entities, and run the explainability method
    # for each named entity.
    highlighted_entities = []
    for ent in nlp(text).ents:
        highlighted_entities.append((ent.text, ent.label_))

    # As the first gallery image, we run the default demo on the full input text:
    overlapped, _ = run_demo(image, text)

    # Then, we run the demo for each of the named entities:
    gallery_images = [add_label_to_img(overlapped, "Full explanation", add_entity_label=False)]
    for ent_text, ent_label in highlighted_entities:
        # Only the image overlay is shown per entity; the per-entity text
        # highlighting is discarded.
        overlapped_ent, _ = run_demo(image, ent_text)
        overlapped_ent_labelled = add_label_to_img(overlapped_ent, f"{ent_text} ({ent_label})")
        gallery_images.append(overlapped_ent_labelled)

    return highlighted_entities, gallery_images
input_img_NER = gr.Image(type='pil', label="Original Image")
input_txt_NER = gr.Textbox(label="Text")
inputs_NER = [input_img_NER, input_txt_NER]

outputs_NER = [gr.HighlightedText(label="Named entities"),
               gr.Gallery(label="NER Entity explanations")]
description_NER = """Automatically generated CLIP grounding explanations for
named entities, retrieved with the spaCy NER model. <span style="color:red">Warning:</span> Note
that attribution methods such as the one in this demo can only give an estimate of the real
underlying behavior of the model."""
iface_NER = gr.Interface(fn=NER_demo,
                         inputs=inputs_NER,
                         outputs=outputs_NER,
                         title="Named Entity Grounding Explainability using CLIP",
                         description=description_NER,
                         examples=[["example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."]],
                         cache_examples=False)
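# Combine the two demos into one tabbed app. TabbedInterface is a Blocks
# subclass, so it can be reopened as a context manager to append the
# acknowledgements below the tabs.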
demo_tabs = gr.TabbedInterface([iface, iface_NER], ["Default", "NER"])

with demo_tabs:
    gr.Markdown("""
        ### Acknowledgements
        This demo was developed for the Interpretability & Explainability in AI course at the University of
        Amsterdam. We would like to express our thanks to Jelle Zuidema, Jaap Jumelet, Tom Kersten, Christos
        Athanasiadis, Peter Heemskerk, Zhi Zhang, and all the other TAs who helped us during this course.

        ---
        ### References
        \[1\]: Chefer, H., Gur, S., & Wolf, L. (2021). Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformers. In Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV). <br>
        \[2\]: Abnar, S., & Zuidema, W. (2020). Quantifying attention flow in transformers. arXiv preprint arXiv:2005.00928. <br>
        \[3\]: [https://samiraabnar.github.io/articles/2020-04/attention_flow](https://samiraabnar.github.io/articles/2020-04/attention_flow) <br>
        """)

demo_tabs.launch(show_error=True)