Spaces:

tomg-group-umd
/

lm-watermarking

Sleeping

App Files Files Community

jwkirchenbauer committed on Feb 17, 2023

Commit

7d29596

1 Parent(s): dafc0b4

more polished interface

Browse files

Files changed (9) hide show

app.py +9 -3
demo_watermark.py +124 -55
homoglyph_data/__init__.py +40 -0
homoglyph_data/categories.json +0 -0
homoglyph_data/confusables_sept2022.json +0 -0
homoglyph_data/languages.json +34 -0
homoglyphs.py +11 -14
requirements.txt +0 -1
watermark_processor.py +2 -2

app.py CHANGED Viewed

@@ -19,9 +19,14 @@ args = Namespace()
 arg_dict = {
     'run_gradio': True,
-    'demo_public': False,
-    # 'model_name_or_path': 'facebook/opt-125m',
-    'model_name_or_path': 'facebook/opt-2.7b',
     'prompt_max_length': None,
     'max_new_tokens': 200,
     'generation_seed': 123,
@@ -36,6 +41,7 @@ arg_dict = {
     'ignore_repeated_bigrams': False,
     'detection_z_threshold': 4.0,
     'select_green_tokens': True,
     'skip_model_load': False,
     'seed_separately': True,
 }

 arg_dict = {
     'run_gradio': True,
+    # 'demo_public': False,
+    'demo_public': True,
+    'model_name_or_path': 'facebook/opt-125m',
+    # 'model_name_or_path': 'facebook/opt-1.3b',
+    # 'model_name_or_path': 'facebook/opt-2.7b',
+    # 'model_name_or_path': 'facebook/opt-6.7b',
+    # 'model_name_or_path': 'facebook/opt-13b',
+    # 'model_name_or_path': 'facebook/opt-30b',
     'prompt_max_length': None,
     'max_new_tokens': 200,
     'generation_seed': 123,
     'ignore_repeated_bigrams': False,
     'detection_z_threshold': 4.0,
     'select_green_tokens': True,
+    # 'skip_model_load': True,
     'skip_model_load': False,
     'seed_separately': True,
 }

demo_watermark.py CHANGED Viewed

@@ -250,6 +250,41 @@ def generate(prompt, args, model=None, device=None, tokenizer=None):
             args)
             # decoded_output_with_watermark)
 def detect(input_text, args, device=None, tokenizer=None):
     watermark_detector = WatermarkDetector(vocab=list(tokenizer.get_vocab().values()),
                                         gamma=args.gamma,
@@ -262,11 +297,13 @@ def detect(input_text, args, device=None, tokenizer=None):
                                         select_green_tokens=args.select_green_tokens)
     if len(input_text)-1 > watermark_detector.min_prefix_len:
         score_dict = watermark_detector.detect(input_text)
-        output_str = (f"Detection result @ {watermark_detector.z_threshold}:\n"
-                        f"{score_dict}")
     else:
-        output_str = (f"Error: string not long enough to compute watermark presence.")
-    return output_str, args
 def run_gradio(args, model=None, device=None, tokenizer=None):
@@ -276,33 +313,41 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
     with gr.Blocks() as demo:
         # Top section, greeting and instructions
-        gr.Markdown("## Demo for ['A Watermark for Large Language Models'](https://arxiv.org/abs/2301.10226)")
-        gr.HTML("""
-                <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
-                <br/>
-                <a href="https://huggingface.co/spaces/tomg-group-umd/lm-watermarking?duplicate=true">
-                <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-                <p/>
-                """)
-        # Construct state for parameters, define updates and toggles, and register event listeners
         session_args = gr.State(value=args)
-        with gr.Tab("Generation"):
             with gr.Row():
-                prompt = gr.Textbox(label=f"Prompt", interactive=True)
             with gr.Row():
                 generate_btn = gr.Button("Generate")
             with gr.Row():
                 with gr.Column(scale=2):
-                    output_without_watermark = gr.Textbox(label="Output Without Watermark", interactive=False)
                 with gr.Column(scale=1):
-                    without_watermark_detection_result = gr.Textbox(label="Detection Result", interactive=False)
             with gr.Row():
                 with gr.Column(scale=2):
-                    output_with_watermark = gr.Textbox(label="Output With Watermark", interactive=False)
                 with gr.Column(scale=1):
-                    with_watermark_detection_result = gr.Textbox(label="Detection Result", interactive=False)
             redecoded_input = gr.Textbox(visible=False)
             truncation_warning = gr.Number(visible=False)
@@ -311,24 +356,16 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
                     return redecoded_input + f"\n\n[Prompt was truncated before generation due to length...]", args
                 else:
                     return orig_prompt, args
-            generate_btn.click(fn=generate_partial, inputs=[prompt,session_args], outputs=[redecoded_input, truncation_warning, output_without_watermark, output_with_watermark,session_args])
-            # Show truncated version of prompt if truncation occurred
-            redecoded_input.change(fn=truncate_prompt, inputs=[redecoded_input,truncation_warning,prompt,session_args], outputs=[prompt,session_args])
-            # Call detection when the outputs of the generate function are updated.
-            output_without_watermark.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
-            output_with_watermark.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
         with gr.Tab("Detector Only"):
             with gr.Row():
-                detection_input = gr.Textbox(label="Text to Analyze", interactive=True)
-            with gr.Row():
-                detect_btn = gr.Button("Detect")
             with gr.Row():
-                detection_result = gr.Textbox(label="Detection Result", interactive=False)
-            detect_btn.click(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result, session_args])
         # Parameter selection group
         with gr.Accordion("Advanced Settings",open=False):
@@ -347,18 +384,23 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
                         max_new_tokens = gr.Slider(label="Max Generated Tokens", minimum=10, maximum=1000, step=10, value=args.max_new_tokens)
                 with gr.Column(scale=1):
-                    gr.Markdown(f"#### Watermarking Parameters")
                     with gr.Row():
                         gamma = gr.Slider(label="gamma",minimum=0.1, maximum=0.9, step=0.05, value=args.gamma)
                     with gr.Row():
                         delta = gr.Slider(label="delta",minimum=0.0, maximum=10.0, step=0.1, value=args.delta)
                     with gr.Row():
                         ignore_repeated_bigrams = gr.Checkbox(label="Ignore Bigram Repeats")
                     with gr.Row():
                         normalizers = gr.CheckboxGroup(label="Normalizations", choices=["unicode", "homoglyphs", "truecase"], value=args.normalizers)
-            gr.Markdown(f"_Note: sliders don't always update perfectly. Clicking on the bar or using the number window to the right can help._")
-            with gr.Accordion("Actual submitted parameters:",open=False):
-                current_parameters = gr.Textbox(label="submitted parameters", value=args)
             with gr.Accordion("Legacy Settings",open=False):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -366,23 +408,31 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
                     with gr.Column(scale=1):
                         select_green_tokens = gr.Checkbox(label="Select 'greenlist' from partition", value=args.select_green_tokens)
-        with gr.Accordion("A note on model capability",open=False):
-            gr.Markdown(
-                """
-                The models that can be used in this demo are limited to those that are open source as well as fit on a single commodity GPU. In particular, there are few models above 10B parameters and way fewer trained using both Instruction finetuning or RLHF that are open source that we can use.
-                Therefore, the model, in both it's un-watermarked (normal) and watermarked state, is not generally able to respond well to the kinds of prompts that a 100B+ Instruction and RLHF tuned model such as ChatGPT, Claude, or Bard is.
-                We suggest you try prompts that give the model a few sentences and then allow it to 'continue' the prompt, as these weaker models are more capable in this simpler language modeling setting.
-                """
-                )
-        # State manager logic
         def update_sampling_temp(session_state, value): session_state.sampling_temp = float(value); return session_state
         def update_generation_seed(session_state, value): session_state.generation_seed = int(value); return session_state
         def update_gamma(session_state, value): session_state.gamma = float(value); return session_state
         def update_delta(session_state, value): session_state.delta = float(value); return session_state
         def update_decoding(session_state, value):
             if value == "multinomial":
                 session_state.use_sampling = True
@@ -405,11 +455,11 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
         def update_normalizers(session_state, value): session_state.normalizers = value; return session_state
         def update_seed_separately(session_state, value): session_state.seed_separately = value; return session_state
         def update_select_green_tokens(session_state, value): session_state.select_green_tokens = value; return session_state
         decoding.change(toggle_sampling_vis,inputs=[decoding], outputs=[sampling_temp])
         decoding.change(toggle_sampling_vis,inputs=[decoding], outputs=[generation_seed])
         decoding.change(toggle_sampling_vis_inv,inputs=[decoding], outputs=[n_beams])
         decoding.change(update_decoding,inputs=[session_args, decoding], outputs=[session_args])
         sampling_temp.change(update_sampling_temp,inputs=[session_args, sampling_temp], outputs=[session_args])
         generation_seed.change(update_generation_seed,inputs=[session_args, generation_seed], outputs=[session_args])
@@ -417,17 +467,36 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
         max_new_tokens.change(update_max_new_tokens,inputs=[session_args, max_new_tokens], outputs=[session_args])
         gamma.change(update_gamma,inputs=[session_args, gamma], outputs=[session_args])
         delta.change(update_delta,inputs=[session_args, delta], outputs=[session_args])
         ignore_repeated_bigrams.change(update_ignore_repeated_bigrams,inputs=[session_args, ignore_repeated_bigrams], outputs=[session_args])
         normalizers.change(update_normalizers,inputs=[session_args, normalizers], outputs=[session_args])
         seed_separately.change(update_seed_separately,inputs=[session_args, seed_separately], outputs=[session_args])
         select_green_tokens.change(update_select_green_tokens,inputs=[session_args, select_green_tokens], outputs=[session_args])
         generate_btn.click(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
         detect_btn.click(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
-        # When the parameters change, also fire detection, since some detection params dont change the model output.
-        current_parameters.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
-        current_parameters.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
     demo.queue(concurrency_count=3)

             args)
             # decoded_output_with_watermark)
+def format_names(s):
+    """Swap raw detector score keys found in ``s`` for human-readable labels."""
+    # Pairs are applied top to bottom, mirroring sequential str.replace calls.
+    label_pairs = (
+        ("num_tokens_scored", "Tokens Counted (T)"),
+        ("num_green_tokens", "# Tokens in Greenlist"),
+        ("green_fraction", "Fraction of T in Greenlist"),
+        ("z_score", "z-score"),
+        ("p_value", "p value"),
+    )
+    for raw_key, label in label_pairs:
+        s = s.replace(raw_key, label)
+    return s
+# def str_format_scores(score_dict, detection_threshold):
+#     output_str = f"@ z-score threshold={detection_threshold}:\n\n"
+#     for k,v in score_dict.items():
+#         if k=='green_fraction':
+#             output_str+=f"{format_names(k)}={v:.1%}"
+#         elif k=='confidence':
+#             output_str+=f"{format_names(k)}={v:.3%}"
+#         elif isinstance(v, float):
+#             output_str+=f"{format_names(k)}={v:.3g}"
+#         else:
+#             output_str += v
+#     return output_str
+def list_format_scores(score_dict, detection_threshold):
+    """Build ``[metric, value]`` string rows for the detection-result table.
+
+    The first row is always the z-score threshold; one row per ``score_dict``
+    entry follows, in the dict's iteration order.
+    """
+    def _render(key, val):
+        # Key-specific formats take precedence over the type-based ones.
+        if key == 'green_fraction':
+            return f"{val:.1%}"
+        if key == 'confidence':
+            return f"{val:.3%}"
+        if isinstance(val, float):
+            return f"{val:.3g}"
+        if isinstance(val, bool):
+            return "Watermarked" if val else "Human/Unwatermarked"
+        return f"{val}"
+
+    rows = [["z-score threshold", f"{detection_threshold}"]]
+    rows.extend([format_names(key), _render(key, val)] for key, val in score_dict.items())
+    return rows
 def detect(input_text, args, device=None, tokenizer=None):
     watermark_detector = WatermarkDetector(vocab=list(tokenizer.get_vocab().values()),
                                         gamma=args.gamma,
                                         select_green_tokens=args.select_green_tokens)
     if len(input_text)-1 > watermark_detector.min_prefix_len:
         score_dict = watermark_detector.detect(input_text)
+        # output = str_format_scores(score_dict, watermark_detector.z_threshold)
+        output = list_format_scores(score_dict, watermark_detector.z_threshold)
     else:
+        # output = (f"Error: string not long enough to compute watermark presence.")
+        output = [["Error","string too short to compute metrics"]]
+        output += [["",""] for _ in range(6)]
+    return output, args
 def run_gradio(args, model=None, device=None, tokenizer=None):
     with gr.Blocks() as demo:
         # Top section, greeting and instructions
+        gr.Markdown("## 💧 [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) 🔍")
+        gr.Markdown("[jwkirchenbauer/lm-watermarking![](https://badgen.net/badge/icon/GitHub?icon=github&label)](https://github.com/jwkirchenbauer/lm-watermarking)")
+        with gr.Accordion("A note on model capability",open=False):
+            gr.Markdown(
+                """
+                The models that can be used in this demo are limited to those that are open source as well as fit on a single commodity GPU. In particular, there are few models above 10B parameters and way fewer trained using both Instruction finetuning or RLHF that are open source that we can use.
+                Therefore, the model, in both it's un-watermarked (normal) and watermarked state, is not generally able to respond well to the kinds of prompts that a 100B+ Instruction and RLHF tuned model such as ChatGPT, Claude, or Bard is.
+                We suggest you try prompts that give the model a few sentences and then allow it to 'continue' the prompt, as these weaker models are more capable in this simpler language modeling setting.
+                """
+                )
+        # Construct state for parameters, define updates and toggles
         session_args = gr.State(value=args)
+        with gr.Tab("Generate and Detect"):
             with gr.Row():
+                prompt = gr.Textbox(label=f"Prompt", interactive=True,lines=12,max_lines=12)
             with gr.Row():
                 generate_btn = gr.Button("Generate")
             with gr.Row():
                 with gr.Column(scale=2):
+                    output_without_watermark = gr.Textbox(label="Output Without Watermark", interactive=False,lines=12,max_lines=12)
                 with gr.Column(scale=1):
+                    # without_watermark_detection_result = gr.Textbox(label="Detection Result", interactive=False,lines=12,max_lines=12)
+                    without_watermark_detection_result = gr.Dataframe(headers=["Metric", "Value"], interactive=False,row_count=7,col_count=2)
             with gr.Row():
                 with gr.Column(scale=2):
+                    output_with_watermark = gr.Textbox(label="Output With Watermark", interactive=False,lines=12,max_lines=12)
                 with gr.Column(scale=1):
+                    # with_watermark_detection_result = gr.Textbox(label="Detection Result", interactive=False,lines=12,max_lines=12)
+                    with_watermark_detection_result = gr.Dataframe(headers=["Metric", "Value"],interactive=False,row_count=7,col_count=2)
             redecoded_input = gr.Textbox(visible=False)
             truncation_warning = gr.Number(visible=False)
                     return redecoded_input + f"\n\n[Prompt was truncated before generation due to length...]", args
                 else:
                     return orig_prompt, args
         with gr.Tab("Detector Only"):
             with gr.Row():
+                with gr.Column(scale=2):
+                    detection_input = gr.Textbox(label="Text to Analyze", interactive=True,lines=12,max_lines=12)
+                with gr.Column(scale=1):
+                    # detection_result = gr.Textbox(label="Detection Result", interactive=False,lines=12,max_lines=12)
+                    detection_result = gr.Dataframe(headers=["Metric", "Value"], interactive=False,row_count=7,col_count=2)
             with gr.Row():
+                    detect_btn = gr.Button("Detect")
         # Parameter selection group
         with gr.Accordion("Advanced Settings",open=False):
                         max_new_tokens = gr.Slider(label="Max Generated Tokens", minimum=10, maximum=1000, step=10, value=args.max_new_tokens)
                 with gr.Column(scale=1):
+                    gr.Markdown(f"#### Watermark Parameters")
                     with gr.Row():
                         gamma = gr.Slider(label="gamma",minimum=0.1, maximum=0.9, step=0.05, value=args.gamma)
                     with gr.Row():
                         delta = gr.Slider(label="delta",minimum=0.0, maximum=10.0, step=0.1, value=args.delta)
+                    gr.Markdown(f"#### Detector Parameters")
+                    with gr.Row():
+                        detection_z_threshold = gr.Slider(label="z-score threshold",minimum=0.0, maximum=10.0, step=0.1, value=args.detection_z_threshold)
                     with gr.Row():
                         ignore_repeated_bigrams = gr.Checkbox(label="Ignore Bigram Repeats")
                     with gr.Row():
                         normalizers = gr.CheckboxGroup(label="Normalizations", choices=["unicode", "homoglyphs", "truecase"], value=args.normalizers)
+            # with gr.Accordion("Actual submitted parameters:",open=False):
+            with gr.Row():
+                gr.Markdown(f"_Note: sliders don't always update perfectly. Clicking on the bar or using the number window to the right can help. Window below shows the current settings._")
+            with gr.Row():
+                current_parameters = gr.Textbox(label="Current Parameters", value=args)
             with gr.Accordion("Legacy Settings",open=False):
                 with gr.Row():
                     with gr.Column(scale=1):
                     with gr.Column(scale=1):
                         select_green_tokens = gr.Checkbox(label="Select 'greenlist' from partition", value=args.select_green_tokens)
+        gr.HTML("""
+                <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
+                <br/>
+                <a href="https://huggingface.co/spaces/tomg-group-umd/lm-watermarking?duplicate=true">
+                <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+                <p/>
+                """)
+        # Register main generation tab click, outputting generations as well as the encoded+redecoded+potentially truncated prompt and flag
+        generate_btn.click(fn=generate_partial, inputs=[prompt,session_args], outputs=[redecoded_input, truncation_warning, output_without_watermark, output_with_watermark,session_args])
+        # Show truncated version of prompt if truncation occurred
+        redecoded_input.change(fn=truncate_prompt, inputs=[redecoded_input,truncation_warning,prompt,session_args], outputs=[prompt,session_args])
+        # Call detection when the outputs (of the generate function) are updated
+        output_without_watermark.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
+        output_with_watermark.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
+        # Register main detection tab click
+        detect_btn.click(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result, session_args])
+        # State management logic
+        # update callbacks that change the state dict
         def update_sampling_temp(session_state, value): session_state.sampling_temp = float(value); return session_state
         def update_generation_seed(session_state, value): session_state.generation_seed = int(value); return session_state
         def update_gamma(session_state, value): session_state.gamma = float(value); return session_state
         def update_delta(session_state, value): session_state.delta = float(value); return session_state
+        def update_detection_z_threshold(session_state, value): session_state.detection_z_threshold = float(value); return session_state
         def update_decoding(session_state, value):
             if value == "multinomial":
                 session_state.use_sampling = True
         def update_normalizers(session_state, value): session_state.normalizers = value; return session_state
         def update_seed_separately(session_state, value): session_state.seed_separately = value; return session_state
         def update_select_green_tokens(session_state, value): session_state.select_green_tokens = value; return session_state
+        # registering callbacks for toggling the visibility of certain parameters
         decoding.change(toggle_sampling_vis,inputs=[decoding], outputs=[sampling_temp])
         decoding.change(toggle_sampling_vis,inputs=[decoding], outputs=[generation_seed])
         decoding.change(toggle_sampling_vis_inv,inputs=[decoding], outputs=[n_beams])
+        # registering all state update callbacks
         decoding.change(update_decoding,inputs=[session_args, decoding], outputs=[session_args])
         sampling_temp.change(update_sampling_temp,inputs=[session_args, sampling_temp], outputs=[session_args])
         generation_seed.change(update_generation_seed,inputs=[session_args, generation_seed], outputs=[session_args])
         max_new_tokens.change(update_max_new_tokens,inputs=[session_args, max_new_tokens], outputs=[session_args])
         gamma.change(update_gamma,inputs=[session_args, gamma], outputs=[session_args])
         delta.change(update_delta,inputs=[session_args, delta], outputs=[session_args])
+        detection_z_threshold.change(update_detection_z_threshold,inputs=[session_args, detection_z_threshold], outputs=[session_args])
         ignore_repeated_bigrams.change(update_ignore_repeated_bigrams,inputs=[session_args, ignore_repeated_bigrams], outputs=[session_args])
         normalizers.change(update_normalizers,inputs=[session_args, normalizers], outputs=[session_args])
         seed_separately.change(update_seed_separately,inputs=[session_args, seed_separately], outputs=[session_args])
         select_green_tokens.change(update_select_green_tokens,inputs=[session_args, select_green_tokens], outputs=[session_args])
+        # register additional callback on button clicks that updates the shown parameters window
         generate_btn.click(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
         detect_btn.click(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
+        # When a detection-relevant parameter changes, display the update and re-fire detection on all three texts, since some detection params don't change the model output. The "Detector Only" re-run must write to detection_result (the table), never back into detection_input (the text being analyzed).
+        gamma.change(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
+        gamma.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
+        gamma.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
+        gamma.change(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result,session_args])
+        detection_z_threshold.change(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
+        detection_z_threshold.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
+        detection_z_threshold.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
+        detection_z_threshold.change(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result,session_args])
+        ignore_repeated_bigrams.change(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
+        ignore_repeated_bigrams.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
+        ignore_repeated_bigrams.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
+        ignore_repeated_bigrams.change(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result,session_args])
+        normalizers.change(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
+        normalizers.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
+        normalizers.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
+        normalizers.change(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result,session_args])
+        select_green_tokens.change(lambda value: str(value), inputs=[session_args], outputs=[current_parameters])
+        select_green_tokens.change(fn=detect_partial, inputs=[output_without_watermark,session_args], outputs=[without_watermark_detection_result,session_args])
+        select_green_tokens.change(fn=detect_partial, inputs=[output_with_watermark,session_args], outputs=[with_watermark_detection_result,session_args])
+        select_green_tokens.change(fn=detect_partial, inputs=[detection_input,session_args], outputs=[detection_result,session_args])
     demo.queue(concurrency_count=3)

homoglyph_data/__init__.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# This is data for homoglyph finding
+"""Original package info:
+Homoglyphs
+* Get similar letters
+* Convert string to ASCII letters
+* Detect possible letter languages
+* Detect letter UTF-8 group.
+# main package info
+__title__ = 'Homoglyphs'
+__version__ = '2.0.4'
+__author__ = 'Gram Orsinium'
+__license__ = 'MIT'
+# License:
+MIT License 2019 orsinium <[email protected]>
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""

homoglyph_data/categories.json ADDED Viewed

The diff for this file is too large to render. See raw diff

homoglyph_data/confusables_sept2022.json ADDED Viewed

The diff for this file is too large to render. See raw diff

homoglyph_data/languages.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+    "ar": "ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
+    "be": "ʼЁІЎАБВГДЕЖЗЙКЛМНОПРСТУФХЦЧШЫЬЭЮЯабвгдежзйклмнопрстуфхцчшыьэюяёіў",
+    "bg": "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
+    "ca": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÈÉÍÏÒÓÚÜÇàèéíïòóúüç·",
+    "cz": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÓÚÝáéíóúýČčĎďĚěŇňŘřŠšŤťŮůŽž",
+    "da": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÅÆØåæø",
+    "de": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜßäöü",
+    "el": "ΪΫΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΐΰϊϋάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ",
+    "en": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
+    "eo": "ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĈĉĜĝĤĥĴĵŜŝŬŭ",
+    "es": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÉÍÑÓÚÜáéíñóúü",
+    "et": "ABDEGHIJKLMNOPRSTUVabdeghijklmnoprstuvÄÕÖÜäõöü",
+    "fi": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÅÖäåöŠšŽž",
+    "fr": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÂÇÈÉÊÎÏÙÛàâçèéêîïùûŒœ",
+    "he": "אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
+    "hr": "ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĆćČčĐđŠšŽž",
+    "hu": "ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű",
+    "it": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÈÉÌÒÓÙàèéìòóù",
+    "lt": "ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzĄąČčĖėĘęĮįŠšŪūŲųŽž",
+    "lv": "ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž",
+    "mk": "ЃЅЈЉЊЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѓѕјљњќџ",
+    "nl": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
+    "pl": "ABCDEFGHIJKLMNOPRSTUWYZabcdefghijklmnoprstuwyzÓóĄąĆćĘęŁłŃńŚśŹźŻż",
+    "pt": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÂÃÇÉÊÍÓÔÕÚàáâãçéêíóôõú",
+    "ro": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÂÎâîĂăȘșȚț",
+    "ru": "ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
+    "sk": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÁÄÉÍÓÔÚÝáäéíóôúýČčĎďĹĺĽľŇňŔŕŠšŤťŽž",
+    "sl": "ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzČčŠšŽž",
+    "sr": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzЂЈЉЊЋЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшђјљњћџ",
+    "th": "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
+    "tr": "ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş",
+    "vi": "ABCDEGHIKLMNOPQRSTUVXYabcdeghiklmnopqrstuvxyÂÊÔâêôĂăĐđƠơƯư"
+}

homoglyphs.py CHANGED Viewed

@@ -9,10 +9,6 @@ from itertools import product
 import os
 import unicodedata
-import homoglyphs_fork as hg
-CURRENT_DIR = hg.core.CURRENT_DIR
 # Actions if char not in alphabet
 STRATEGY_LOAD = 1  # load category for this char
 STRATEGY_IGNORE = 2  # add char to result
@@ -21,13 +17,17 @@ STRATEGY_REMOVE = 3  # remove char from result
 ASCII_RANGE = range(128)
 class Categories:
     """
     Work with aliases from ISO 15924.
     https://en.wikipedia.org/wiki/ISO_15924#List_of_codes
     """
-    fpath = os.path.join(CURRENT_DIR, "categories.json")
     @classmethod
     def _get_ranges(cls, categories):
@@ -70,8 +70,9 @@ class Categories:
         # try detect category by unicodedata
         try:
             category = unicodedata.name(char).split()[0]
-        except TypeError:
             # In Python2 unicodedata.name raise error for non-unicode chars
             pass
         else:
             if category in data["aliases"]:
@@ -91,7 +92,7 @@ class Categories:
 class Languages:
-    fpath = os.path.join(CURRENT_DIR, "languages.json")
     @classmethod
     def get_alphabet(cls, languages):
@@ -167,8 +168,7 @@ class Homoglyphs:
     @staticmethod
     def get_table(alphabet):
         table = defaultdict(set)
-        # removed CURRENT_DIR here:
-        with open(os.path.join("confusables_sept2022.json")) as f:
             data = json.load(f)
         for char in alphabet:
             if char in data:
@@ -180,8 +180,7 @@ class Homoglyphs:
     @staticmethod
     def get_restricted_table(source_alphabet, target_alphabet):
         table = defaultdict(set)
-        # removed CURRENT_DIR here:
-        with open(os.path.join("confusables_sept2022.json")) as f:
             data = json.load(f)
         for char in source_alphabet:
             if char in data:
@@ -244,9 +243,7 @@ class Homoglyphs:
             alt_chars = self._get_char_variants(char)
             if ascii:
-                alt_chars = [
-                    char for char in alt_chars if ord(char) in self.ascii_range
-                ]
                 if not alt_chars and self.ascii_strategy == STRATEGY_IGNORE:
                     return

 import os
 import unicodedata
 # Actions if char not in alphabet
 STRATEGY_LOAD = 1  # load category for this char
 STRATEGY_IGNORE = 2  # add char to result
 ASCII_RANGE = range(128)
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+DATA_LOCATION = os.path.join(CURRENT_DIR, "homoglyph_data")
 class Categories:
     """
     Work with aliases from ISO 15924.
     https://en.wikipedia.org/wiki/ISO_15924#List_of_codes
     """
+    fpath = os.path.join(DATA_LOCATION, "categories.json")
     @classmethod
     def _get_ranges(cls, categories):
         # try detect category by unicodedata
         try:
             category = unicodedata.name(char).split()[0]
+        except (TypeError, ValueError):
             # In Python2 unicodedata.name raise error for non-unicode chars
+            # Python3 raises ValueError for characters that have no Unicode name
             pass
         else:
             if category in data["aliases"]:
 class Languages:
+    fpath = os.path.join(DATA_LOCATION, "languages.json")
     @classmethod
     def get_alphabet(cls, languages):
     @staticmethod
     def get_table(alphabet):
         table = defaultdict(set)
+        with open(os.path.join(DATA_LOCATION, "confusables_sept2022.json")) as f:
             data = json.load(f)
         for char in alphabet:
             if char in data:
     @staticmethod
     def get_restricted_table(source_alphabet, target_alphabet):
         table = defaultdict(set)
+        with open(os.path.join(DATA_LOCATION, "confusables_sept2022.json")) as f:
             data = json.load(f)
         for char in source_alphabet:
             if char in data:
             alt_chars = self._get_char_variants(char)
             if ascii:
+                alt_chars = [char for char in alt_chars if ord(char) in self.ascii_range]
                 if not alt_chars and self.ascii_strategy == STRATEGY_IGNORE:
                     return

requirements.txt CHANGED Viewed

@@ -1,4 +1,3 @@
-homoglyphs_fork
 nltk
 scipy
 torch

 nltk
 scipy
 torch

watermark_processor.py CHANGED Viewed

@@ -216,6 +216,8 @@ class WatermarkDetector(WatermarkBase):
             score_dict.update(dict(num_tokens_scored=num_tokens_scored))
         if return_num_green_tokens:
             score_dict.update(dict(num_green_tokens=green_token_count))
         if return_z_score:
             score_dict.update(dict(z_score=self._compute_z_score(green_token_count, num_tokens_scored)))
         if return_p_value:
@@ -223,8 +225,6 @@ class WatermarkDetector(WatermarkBase):
             if z_score is None:
                 z_score = self._compute_z_score(green_token_count, num_tokens_scored)
             score_dict.update(dict(p_value=self._compute_p_value(z_score)))
-        if return_green_fraction:
-            score_dict.update(dict(green_fraction=(green_token_count / num_tokens_scored)))
         if return_green_token_mask:
             score_dict.update(dict(green_token_mask=green_token_mask))

             score_dict.update(dict(num_tokens_scored=num_tokens_scored))
         if return_num_green_tokens:
             score_dict.update(dict(num_green_tokens=green_token_count))
+        if return_green_fraction:
+            score_dict.update(dict(green_fraction=(green_token_count / num_tokens_scored)))
         if return_z_score:
             score_dict.update(dict(z_score=self._compute_z_score(green_token_count, num_tokens_scored)))
         if return_p_value:
             if z_score is None:
                 z_score = self._compute_z_score(green_token_count, num_tokens_scored)
             score_dict.update(dict(p_value=self._compute_p_value(z_score)))
         if return_green_token_mask:
             score_dict.update(dict(green_token_mask=green_token_mask))