Spaces:
Runtime error
Runtime error
Commit
·
eb4710d
0
Parent(s):
First version
Browse files- run_tapas_viz.py +23 -0
- tapas-styles.css +38 -0
- tapas_visualizer.py +139 -0
run_tapas_viz.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
from transformers import TapasTokenizer
|
| 4 |
+
|
| 5 |
+
from tapas_visualizer import TapasVisualizer
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def main():
    """Build a small demo table and print its TAPAS tokenization as HTML."""
    tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")
    visualizer = TapasVisualizer(tokenizer)

    demo_rows = {
        "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
        "Age": ["56", "45", "59"],
        "Number of movies": ["87", "53", "69"],
    }
    demo_table = pd.DataFrame.from_dict(demo_rows)

    # The visualizer returns a full, self-contained HTML page as a string.
    html = visualizer(demo_table)
    print(html)


if __name__ == "__main__":
    main()
|
tapas-styles.css
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Styles for the TAPAS tokenization visualizer: the tokenized table is
   rendered inside .tokenized-text, with each token wrapped in a .token
   span that alternates .even-token / .odd-token shading, and untokenized
   text (mostly whitespace) wrapped in .non-token. */

.tokenized-text {
    width:100%;
    padding:2rem;
    max-height: 400px;
    overflow-y: auto;
    box-sizing:border-box;
    line-height:4rem; /* Lots of space between lines */
    font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
    box-shadow: 2px 2px 2px rgba(0,0,0,0.2);
    background-color: rgba(0,0,0,0.01);
    letter-spacing:2px; /* Give some extra separation between chars */
}

.non-token{
    /* White space and other things the tokenizer ignores */
    white-space: pre;
    letter-spacing:4px;
    border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more obvious */
    border-bottom:1px solid #A0A0A0;
    line-height: 1rem;
    height: calc(100% - 2px);
}

.token {
    white-space: pre;
    position:relative;
    color:black;
    letter-spacing:2px;
}

.even-token{
    background:#DCDCDC ;
    border: 1px solid #DCDCDC;
}
.odd-token{
    background:#A0A0A0;
    border: 1px solid #A0A0A0;
}
|
tapas_visualizer.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, List
|
| 3 |
+
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
# Load the visualizer's CSS from the stylesheet that sits next to this
# module, so the generated HTML pages are self-contained.
dirname = os.path.dirname(__file__)
css_filename = os.path.join(dirname, "tapas-styles.css")
# Read with an explicit encoding so the result does not depend on the
# platform's locale default.
with open(css_filename, encoding="utf-8") as f:
    css = f.read()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def HTMLBody(table_html: str, css_styles=css) -> str:
|
| 15 |
+
"""
|
| 16 |
+
Generates the full html with css from a list of html spans
|
| 17 |
+
|
| 18 |
+
Args:
|
| 19 |
+
children (:obj:`List[str]`):
|
| 20 |
+
A list of strings, assumed to be html elements
|
| 21 |
+
|
| 22 |
+
css_styles (:obj:`str`, `optional`):
|
| 23 |
+
Optional alternative implementation of the css
|
| 24 |
+
|
| 25 |
+
Returns:
|
| 26 |
+
:obj:`str`: An HTML string with style markup
|
| 27 |
+
"""
|
| 28 |
+
return f"""
|
| 29 |
+
<html>
|
| 30 |
+
<head>
|
| 31 |
+
<style>
|
| 32 |
+
{css_styles}
|
| 33 |
+
</style>
|
| 34 |
+
</head>
|
| 35 |
+
<body>
|
| 36 |
+
<div class="tokenized-text" dir=auto>
|
| 37 |
+
{table_html}
|
| 38 |
+
</div>
|
| 39 |
+
</body>
|
| 40 |
+
</html>
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class TapasVisualizer:
    """Render a pandas table as HTML in which every cell's text is split into
    the tokens produced by a TAPAS tokenizer, with alternating highlighting
    per token (styled by ``tapas-styles.css``)."""

    def __init__(self, tokenizer) -> None:
        # Expected to behave like ``transformers.TapasTokenizer``:
        # callable on a DataFrame, exposing ``_convert_id_to_token``.
        self.tokenizer = tokenizer

    def normalize_token_str(self, token_str: str) -> str:
        """Strip WordPiece continuation markers ("##") so a token can be
        matched against the original, untokenized text."""
        return token_str.replace("##", "")

    def style_span(self, span_text: str, css_classes: List[str]) -> str:
        """Wrap ``span_text`` in a ``<span>`` carrying ``css_classes``."""
        css = f'''class="{' '.join(css_classes)}"'''
        return f"<span {css} >{span_text}</span>"

    def text_to_html(self, org_text: str, tokens: List[str]) -> List[str]:
        """Create html spans based on the original text and its tokens.

        Note: The tokens need to be in same order as in the original text

        Args:
            org_text (str): Original string before tokenization
            tokens (List[str]): The tokens of org_text

        Returns:
            List[str]: html ``<span>`` strings, one per token and one per
            stretch of untokenized text (whitespace etc.)
        """
        if len(tokens) == 0:
            print(f'Empty tokens for: {org_text}')
            # Empty list keeps the return type consistent; callers join the
            # result, so this renders identically to an empty string.
            return []

        cur_token_id = 0
        cur_token = self.normalize_token_str(tokens[cur_token_id])

        # Scan the original text character by character, greedily matching
        # the current (normalized) token at each position.
        next_start = 0
        last_end = 0
        spans = []

        while next_start < len(org_text):
            candidate = org_text[next_start: next_start + len(cur_token)]

            # The tokenizer performs lowercasing; so check against lowercase
            if candidate.lower() == cur_token:
                if last_end != next_start:
                    # There was token-less text (probably whitespace)
                    # in the middle
                    spans.append(self.style_span(org_text[last_end: next_start], ['non-token']))

                odd_or_even = 'even-token' if cur_token_id % 2 == 0 else 'odd-token'
                spans.append(self.style_span(candidate, ['token', odd_or_even]))
                next_start += len(cur_token)
                last_end = next_start
                cur_token_id += 1
                if cur_token_id >= len(tokens):
                    break
                cur_token = self.normalize_token_str(tokens[cur_token_id])
            else:
                next_start += 1

        if last_end != len(org_text):
            # Slice to the end of the text: after the ``break`` above,
            # ``next_start`` equals ``last_end``, so slicing to ``next_start``
            # would silently drop any trailing untokenized text.
            spans.append(self.style_span(org_text[last_end:], ['non-token']))

        return spans

    def __call__(self, table: pd.DataFrame) -> Any:
        """Tokenize ``table`` and return a styled, self-contained HTML page.

        Args:
            table (pd.DataFrame): Table of string cells to visualize

        Returns:
            str: Complete HTML document (via :func:`HTMLBody`)
        """
        tokenized = self.tokenizer(table)

        # Group token strings by their (row, col) cell; row 0 is the header.
        cell_tokens = defaultdict(list)
        for id_ind, input_id in enumerate(tokenized['input_ids']):
            input_id = int(input_id)
            # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation' not required
            segment_id, col_id, row_id, *_ = tokenized['token_type_ids'][id_ind]
            token_text = self.tokenizer._convert_id_to_token(input_id)
            cell_tokens[(row_id, col_id)].append(token_text)

        header_row_html = ""
        for col_id, col in enumerate(table.columns, start=1):
            span_htmls = self.text_to_html(col, cell_tokens[0, col_id])
            cell_html = "".join(span_htmls)
            header_row_html += f"<th>{cell_html}</th>"
        header_row_html = f'<tr>{header_row_html}</tr>'

        table_vals = table.values

        table_html = header_row_html

        for row_id, row in enumerate(table_vals, start=1):
            row_html = ""
            for col_id, cell in enumerate(row, start=1):
                span_htmls = self.text_to_html(cell, cell_tokens[row_id, col_id])
                cell_html = "".join(span_htmls)
                row_html += f"<td>{cell_html}</td>"
            table_html += f'<tr>{row_html}</tr>'

        table_html = f'<table>{table_html}</table>'
        return HTMLBody(table_html)
|