"""Gradio demo that redacts sensitive entities from Thai clinical text."""

import gradio as gr
from transformers import pipeline

# Load NER model. Device -1 means CPU.
# If you have a GPU, you can set device=0 or the appropriate GPU index.
ner_model = pipeline("token-classification", model="loolootech/no-name-ner-th", device=-1)

# Entity labels that are redacted, mapped to the placeholder written in their place.
# Labels predicted by the model but absent from this map are left untouched.
ENTITY_TO_ANONYMIZED_TOKEN_MAP = {
    "PERSON": "[PERSON]",
    "PHONE": "[PHONE]",
    "EMAIL": "[EMAIL]",
    "ADDRESS": "[LOCATION]",
    "DATE": "[DATE]",
    "NATIONAL_ID": "[NATIONAL_ID]",
    "HOSPITAL_IDS": "[HOSPITAL_IDS]",
}


def _merge_adjacent_entities(ner_results, original_text):
    """Collapse consecutive same-label token predictions into single spans.

    Parameters
    ----------
    ner_results : list of dict
        Token-level predictions, each carrying at least the keys 'entity',
        'start' and 'end'. Assumed to be ordered by position in the text,
        as the loop only ever compares a prediction with the previous span.
    original_text : str
        The text the offsets in `ner_results` refer to.

    Returns
    -------
    list of dict
        New dictionaries with keys 'entity' (label without its prefix),
        'start', 'end' and 'word' (the exact slice of `original_text`).
        The input dictionaries are not modified.
    """
    merged = []
    for prediction in ner_results:
        # Normalize entity label (e.g., "B-PERSON" -> "PERSON")
        label = prediction['entity'].split('-')[-1]
        start, end = prediction['start'], prediction['end']

        previous = merged[-1] if merged else None
        # Contiguity is judged on the real character offsets: two predictions
        # belong together when nothing but whitespace lies between them (an
        # overlapping span yields an empty slice and therefore also merges).
        # Measuring the token string instead is unreliable, because it may
        # carry tokenizer markers and would grow with every merge.
        if (
            previous is not None
            and previous['entity'] == label
            and not original_text[previous['end']:start].strip()
        ):
            previous['end'] = max(previous['end'], end)
        else:
            merged.append({'entity': label, 'start': start, 'end': end})

    # Report the surface text that will actually be replaced rather than a
    # space-joined sequence of tokenizer pieces.
    for span in merged:
        span['word'] = original_text[span['start']:span['end']]
    return merged


def anonymize_text(original_text):
    """
    Anonymizes sensitive entities in the input text using a named entity
    recognition (NER) model.

    Parameters
    ----------
    original_text : str
        The text that may contain sensitive information such as names,
        phone numbers, emails, etc.

    Returns
    -------
    list
        A list containing:
        - original_text: the original input string
        - anonymized_text: the text with specified entities replaced by tokens
        - anonymized_entities: a list of dictionaries for each anonymized
          entity, containing the redacted text ("word") and its entity
          label ("label"), ordered from the last occurrence to the first

    Notes
    -----
    - Requires `ner_model` to be defined and initialized (e.g., a HuggingFace
      NER pipeline) and to return character offsets ('start'/'end').
    - The mapping `ENTITY_TO_ANONYMIZED_TOKEN_MAP` defines which entity types
      will be replaced and what token will be used.
    - Empty or whitespace-only input is returned unchanged without running
      the model.
    """
    # Nothing to redact: skip the model call entirely.
    if not original_text or not original_text.strip():
        return [original_text, original_text, []]

    # Step 1: Perform NER on the input text
    ner_results = ner_model(original_text)

    # Step 2: Combine overlapping or adjacent entities of the same type
    combined_entities = _merge_adjacent_entities(ner_results, original_text)

    # Step 3: Filter entities that should be anonymized
    entities_to_anonymize = [
        e for e in combined_entities if e['entity'] in ENTITY_TO_ANONYMIZED_TOKEN_MAP
    ]

    # Step 4: Sort entities in reverse order of start index so that replacing
    # one span never shifts the offsets of the spans still to be processed
    entities_to_anonymize.sort(key=lambda e: e['start'], reverse=True)

    # Step 5: Replace each entity in the text with the corresponding anonymized token
    anonymized_text = original_text
    for entity in entities_to_anonymize:
        start, end = entity['start'], entity['end']
        token = ENTITY_TO_ANONYMIZED_TOKEN_MAP[entity['entity']]
        anonymized_text = anonymized_text[:start] + token + anonymized_text[end:]

    # Step 6: Return original text, anonymized text, and information about redacted entities
    return [
        original_text,
        anonymized_text,
        [{"word": e["word"], "label": e["entity"]} for e in entities_to_anonymize],
    ]


with gr.Blocks(title="Thai Clinical Conversation De-identification") as demo:
    # Page header. The literal is kept flush-left so its content is unchanged.
    gr.HTML(
        """

Thai Clinical Conversation De-identification

Paste Thai clinical or personal text below to redact sensitive info.

"""
    )

    # Use a gr.Row with gr.Column spacers to center the image
    with gr.Row():
        gr.Column(scale=1)  # Left spacer
        gr.Image(
            value="mascot-image-landscape.png",  # Replace with your image URL/path
            width=200,  # Set a smaller width for the image
            show_label=False,
            container=False,  # Prevent the image from being wrapped in a default Gradio container
        )
        gr.Column(scale=1)  # Right spacer

    # Add the main interface components
    gr.Interface(
        fn=anonymize_text,
        inputs=gr.Textbox(lines=10, label="Input Text"),
        outputs=[
            gr.Textbox(label="Original Text"),
            gr.Textbox(label="Anonymized Text"),
            gr.JSON(label="Entities"),
        ],
        live=False,  # Set live=False since we are using Blocks now
    )

if __name__ == "__main__":
    demo.launch(share=False)