# atirut-name-looloo's picture
# Update app.py
# 0398229 verified
import gradio as gr
from transformers import pipeline
# Token-classification (NER) pipeline shared by the whole app.
# device=-1 runs on CPU; pass device=0 (or another GPU index) to run on a GPU.
ner_model = pipeline(
    "token-classification",
    model="loolootech/no-name-ner-th",
    device=-1,
)
# Entity label -> placeholder written into the anonymized text.
# Only labels listed here are redacted. Note that ADDRESS is rendered as
# "[LOCATION]"; every other placeholder is the label wrapped in brackets.
ENTITY_TO_ANONYMIZED_TOKEN_MAP = {
    "PERSON": "[PERSON]",
    "PHONE": "[PHONE]",
    "EMAIL": "[EMAIL]",
    "ADDRESS": "[LOCATION]",
    "DATE": "[DATE]",
    "NATIONAL_ID": "[NATIONAL_ID]",
    "HOSPITAL_IDS": "[HOSPITAL_IDS]",
}
def anonymize_text(original_text):
    """
    Replace sensitive entities found by the NER model with placeholder tokens.

    Parameters
    ----------
    original_text : str
        The text that may contain sensitive information such as names, phone
        numbers, emails, etc.

    Returns
    -------
    list
        A three-item list:
        - original_text: the input string, unchanged
        - anonymized_text: the input with every entity whose label is a key of
          `ENTITY_TO_ANONYMIZED_TOKEN_MAP` replaced by the mapped token
        - anonymized_entities: one ``{"word": ..., "label": ...}`` dict per
          replaced entity, ordered from the last entity in the text to the
          first; ``word`` is the exact substring of `original_text` that was
          replaced

    Notes
    -----
    - Requires the module-level `ner_model` callable; every prediction it
      returns must provide the keys ``entity``, ``start`` and ``end``.
    - Consecutive predictions with the same label are merged into one entity
      when their character spans touch, overlap, or are separated only by
      whitespace.
    - Empty or whitespace-only input is returned as-is without calling the model.
    """
    # Nothing can be redacted in empty / whitespace-only input, so skip the model.
    if not original_text or not original_text.strip():
        return [original_text, original_text, []]

    # Step 1: Perform NER on the input text
    ner_results = ner_model(original_text)

    # Step 2: Combine consecutive predictions of the same type into one span.
    # Fresh dicts are built so the pipeline's own result objects are not mutated.
    combined_entities = []
    for prediction in ner_results:
        # Normalize entity label (e.g., "B-PERSON" -> "PERSON")
        label = prediction['entity'].split('-')[-1]
        start, end = prediction['start'], prediction['end']

        previous = combined_entities[-1] if combined_entities else None
        # Decide on real character offsets: the token text cannot be used to
        # measure the span because it may carry tokenizer markers.
        # The slice is empty when the spans touch or overlap.
        gap = original_text[previous['end']:start] if previous else ''
        if previous and previous['entity'] == label and not gap.strip():
            previous['end'] = max(previous['end'], end)
        else:
            combined_entities.append({'entity': label, 'start': start, 'end': end})

    # Step 3: Keep only the entity types that have a placeholder token
    entities_to_anonymize = [
        e for e in combined_entities if e['entity'] in ENTITY_TO_ANONYMIZED_TOKEN_MAP
    ]

    # Step 4: Process from the end of the text backwards so that a replacement
    # never shifts the offsets of the entities still to be replaced.
    entities_to_anonymize.sort(key=lambda e: e['start'], reverse=True)

    # Step 5: Replace each entity span with its placeholder token
    anonymized_text = original_text
    for e in entities_to_anonymize:
        token = ENTITY_TO_ANONYMIZED_TOKEN_MAP[e['entity']]
        anonymized_text = anonymized_text[:e['start']] + token + anonymized_text[e['end']:]

    # Step 6: Return original text, anonymized text, and the redacted entities.
    # The reported word is cut from the source text so it matches exactly what
    # was replaced (no injected spaces between sub-word tokens).
    return [
        original_text,
        anonymized_text,
        [
            {"word": original_text[e['start']:e['end']], "label": e['entity']}
            for e in entities_to_anonymize
        ],
    ]
# Page layout: headline banner, centered mascot image, then the anonymizer form.
with gr.Blocks(title="Thai Clinical Conversation De-identification") as demo:
    # Headline and one-line usage hint.
    gr.HTML(
        "\n"
        '<div style="text-align: center;">\n'
        '<h1 style="font-size: 3em;">Thai Clinical Conversation De-identification</h1>\n'
        '<p style="font-size: 1.2em;">Paste Thai clinical or personal text below to redact sensitive info.</p>\n'
        "</div>\n"
    )

    # An empty column on each side of the image acts as a spacer to center it.
    with gr.Row():
        gr.Column(scale=1)  # left spacer
        gr.Image(
            value="mascot-image-landscape.png",  # path/URL of the mascot image
            width=200,  # keep the image small
            show_label=False,
            container=False,  # no default Gradio wrapper around the image
        )
        gr.Column(scale=1)  # right spacer

    # Main form: one text input, three outputs matching the three items
    # returned by anonymize_text (original text, anonymized text, entities).
    gr.Interface(
        fn=anonymize_text,
        inputs=gr.Textbox(lines=10, label="Input Text"),
        outputs=[
            gr.Textbox(label="Original Text"),
            gr.Textbox(label="Anonymized Text"),
            gr.JSON(label="Entities"),
        ],
        live=False,
    )

# Start the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch(share=False)