Spaces:
Runtime error
Runtime error
Commit
·
eb4710d
0
Parent(s):
First version
Browse files- run_tapas_viz.py +23 -0
- tapas-styles.css +38 -0
- tapas_visualizer.py +139 -0
run_tapas_viz.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
from transformers import TapasTokenizer
|
| 4 |
+
|
| 5 |
+
from tapas_visualizer import TapasVisualizer
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def main():
    """Build a small demo table and print its TAPAS tokenization as HTML."""
    tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")
    visualizer = TapasVisualizer(tokenizer)

    demo_rows = {
        "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
        "Age": ["56", "45", "59"],
        "Number of movies": ["87", "53", "69"],
    }
    demo_table = pd.DataFrame.from_dict(demo_rows)

    # The visualizer returns a full, self-contained HTML page as a string.
    html = visualizer(demo_table)
    print(html)


if __name__ == "__main__":
    main()
|
tapas-styles.css
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Styles for the TAPAS tokenization visualizer: the tokenized table is
   rendered inside .tokenized-text, with each token wrapped in a .token
   span that alternates .even-token / .odd-token shading, and untokenized
   text (mostly whitespace) wrapped in .non-token. */

.tokenized-text {
    width:100%;
    padding:2rem;
    max-height: 400px;
    overflow-y: auto;
    box-sizing:border-box;
    line-height:4rem; /* Lots of space between lines */
    font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
    box-shadow: 2px 2px 2px rgba(0,0,0,0.2);
    background-color: rgba(0,0,0,0.01);
    letter-spacing:2px; /* Give some extra separation between chars */
}

.non-token{
    /* White space and other things the tokenizer ignores */
    white-space: pre;
    letter-spacing:4px;
    border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more obvious */
    border-bottom:1px solid #A0A0A0;
    line-height: 1rem;
    height: calc(100% - 2px);
}

.token {
    white-space: pre;
    position:relative;
    color:black;
    letter-spacing:2px;
}

.even-token{
    background:#DCDCDC ;
    border: 1px solid #DCDCDC;
}
.odd-token{
    background:#A0A0A0;
    border: 1px solid #A0A0A0;
}
|
tapas_visualizer.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, List
|
| 3 |
+
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
# Load the visualizer's CSS from the stylesheet that sits next to this
# module, so the generated HTML pages are self-contained.
dirname = os.path.dirname(__file__)
css_filename = os.path.join(dirname, "tapas-styles.css")
# Read with an explicit encoding so the result does not depend on the
# platform's locale default.
with open(css_filename, encoding="utf-8") as f:
    css = f.read()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def HTMLBody(table_html: str, css_styles=css) -> str:
|
| 15 |
+
"""
|
| 16 |
+
Generates the full html with css from a list of html spans
|
| 17 |
+
|
| 18 |
+
Args:
|
| 19 |
+
children (:obj:`List[str]`):
|
| 20 |
+
A list of strings, assumed to be html elements
|
| 21 |
+
|
| 22 |
+
css_styles (:obj:`str`, `optional`):
|
| 23 |
+
Optional alternative implementation of the css
|
| 24 |
+
|
| 25 |
+
Returns:
|
| 26 |
+
:obj:`str`: An HTML string with style markup
|
| 27 |
+
"""
|
| 28 |
+
return f"""
|
| 29 |
+
<html>
|
| 30 |
+
<head>
|
| 31 |
+
<style>
|
| 32 |
+
{css_styles}
|
| 33 |
+
</style>
|
| 34 |
+
</head>
|
| 35 |
+
<body>
|
| 36 |
+
<div class="tokenized-text" dir=auto>
|
| 37 |
+
{table_html}
|
| 38 |
+
</div>
|
| 39 |
+
</body>
|
| 40 |
+
</html>
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class TapasVisualizer:
    """Render a pandas table as HTML in which every cell's text is split into
    the tokens produced by a TAPAS tokenizer, with alternating highlighting
    per token (styled by ``tapas-styles.css``)."""

    def __init__(self, tokenizer) -> None:
        # Expected to behave like ``transformers.TapasTokenizer``:
        # callable on a DataFrame, exposing ``_convert_id_to_token``.
        self.tokenizer = tokenizer

    def normalize_token_str(self, token_str: str) -> str:
        """Strip WordPiece continuation markers ("##") so a token can be
        matched against the original, untokenized text."""
        return token_str.replace("##", "")

    def style_span(self, span_text: str, css_classes: List[str]) -> str:
        """Wrap ``span_text`` in a ``<span>`` carrying ``css_classes``."""
        css = f'''class="{' '.join(css_classes)}"'''
        return f"<span {css} >{span_text}</span>"

    def text_to_html(self, org_text: str, tokens: List[str]) -> List[str]:
        """Create html spans based on the original text and its tokens.

        Note: The tokens need to be in same order as in the original text

        Args:
            org_text (str): Original string before tokenization
            tokens (List[str]): The tokens of org_text

        Returns:
            List[str]: html ``<span>`` strings, one per token and one per
            stretch of untokenized text (whitespace etc.)
        """
        if len(tokens) == 0:
            print(f'Empty tokens for: {org_text}')
            # Empty list keeps the return type consistent; callers join the
            # result, so this renders identically to an empty string.
            return []

        cur_token_id = 0
        cur_token = self.normalize_token_str(tokens[cur_token_id])

        # Scan the original text character by character, greedily matching
        # the current (normalized) token at each position.
        next_start = 0
        last_end = 0
        spans = []

        while next_start < len(org_text):
            candidate = org_text[next_start: next_start + len(cur_token)]

            # The tokenizer performs lowercasing; so check against lowercase
            if candidate.lower() == cur_token:
                if last_end != next_start:
                    # There was token-less text (probably whitespace)
                    # in the middle
                    spans.append(self.style_span(org_text[last_end: next_start], ['non-token']))

                odd_or_even = 'even-token' if cur_token_id % 2 == 0 else 'odd-token'
                spans.append(self.style_span(candidate, ['token', odd_or_even]))
                next_start += len(cur_token)
                last_end = next_start
                cur_token_id += 1
                if cur_token_id >= len(tokens):
                    break
                cur_token = self.normalize_token_str(tokens[cur_token_id])
            else:
                next_start += 1

        if last_end != len(org_text):
            # Slice to the end of the text: after the ``break`` above,
            # ``next_start`` equals ``last_end``, so slicing to ``next_start``
            # would silently drop any trailing untokenized text.
            spans.append(self.style_span(org_text[last_end:], ['non-token']))

        return spans

    def __call__(self, table: pd.DataFrame) -> Any:
        """Tokenize ``table`` and return a styled, self-contained HTML page.

        Args:
            table (pd.DataFrame): Table of string cells to visualize

        Returns:
            str: Complete HTML document (via :func:`HTMLBody`)
        """
        tokenized = self.tokenizer(table)

        # Group token strings by their (row, col) cell; row 0 is the header.
        cell_tokens = defaultdict(list)
        for id_ind, input_id in enumerate(tokenized['input_ids']):
            input_id = int(input_id)
            # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation' not required
            segment_id, col_id, row_id, *_ = tokenized['token_type_ids'][id_ind]
            token_text = self.tokenizer._convert_id_to_token(input_id)
            cell_tokens[(row_id, col_id)].append(token_text)

        header_row_html = ""
        for col_id, col in enumerate(table.columns, start=1):
            span_htmls = self.text_to_html(col, cell_tokens[0, col_id])
            cell_html = "".join(span_htmls)
            header_row_html += f"<th>{cell_html}</th>"
        header_row_html = f'<tr>{header_row_html}</tr>'

        table_vals = table.values

        table_html = header_row_html

        for row_id, row in enumerate(table_vals, start=1):
            row_html = ""
            for col_id, cell in enumerate(row, start=1):
                span_htmls = self.text_to_html(cell, cell_tokens[row_id, col_id])
                cell_html = "".join(span_htmls)
                row_html += f"<td>{cell_html}</td>"
            table_html += f'<tr>{row_html}</tr>'

        table_html = f'<table>{table_html}</table>'
        return HTMLBody(table_html)
|