Spaces:
Running
Running
File size: 4,495 Bytes
f6ccd60 0398229 f6ccd60 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import gradio as gr
from transformers import pipeline
# Load NER model. Device -1 means CPU.
# If you have a GPU, you can set device=0 or the appropriate GPU index.
ner_model = pipeline("token-classification", model="loolootech/no-name-ner-th", device=-1)
ENTITY_TO_ANONYMIZED_TOKEN_MAP = {
"PERSON": "[PERSON]",
"PHONE": "[PHONE]",
"EMAIL": "[EMAIL]",
"ADDRESS": "[LOCATION]",
"DATE": "[DATE]",
"NATIONAL_ID": "[NATIONAL_ID]",
"HOSPITAL_IDS": "[HOSPITAL_IDS]",
}
def anonymize_text(original_text):
"""
Anonymizes sensitive entities in the input text using a named entity recognition (NER) model.
Parameters
----------
original_text : str
The text that may contain sensitive information such as names, phone numbers, emails, etc.
Returns
-------
list
A list containing:
- original_text: the original input string
- anonymized_text: the text with specified entities replaced by tokens
- anonymized_entities: a list of dictionaries for each anonymized entity, containing
the original word and its entity label
Notes
-----
- Requires `ner_model` to be defined and initialized (e.g., a HuggingFace NER pipeline).
- The mapping `ENTITY_TO_ANONYMIZED_TOKEN_MAP` defines which entity types will be replaced
and what token will be used.
"""
# Step 1: Perform NER on the input text
ner_results = ner_model(original_text)
# Step 2: Combine overlapping or adjacent entities of the same type
combined_entities = []
for entity in ner_results:
# Normalize entity label (e.g., "B-PERSON" -> "PERSON")
entity_name = entity['entity'].split('-')[-1]
entity['entity'] = entity_name
# Add as new entity if list is empty, different type, or non-overlapping
if not combined_entities or combined_entities[-1]['entity'] != entity_name or \
combined_entities[-1]['start'] + len(combined_entities[-1]['word']) < entity['start']:
combined_entities.append(entity)
else:
# Merge adjacent/overlapping entities of the same type
combined_entities[-1]['word'] += ' ' + entity['word']
combined_entities[-1]['end'] = entity['end']
# Step 3: Filter entities that should be anonymized
entities_to_anonymize = [
e for e in combined_entities if e['entity'] in ENTITY_TO_ANONYMIZED_TOKEN_MAP.keys()
]
# Step 4: Sort entities in reverse order of start index to safely replace them
entities_to_anonymize.sort(key=lambda x: x['start'], reverse=True)
# Step 5: Replace each entity in the text with the corresponding anonymized token
anonymized_text = original_text
for entity in entities_to_anonymize:
start, end = entity['start'], entity['end']
token = ENTITY_TO_ANONYMIZED_TOKEN_MAP.get(entity['entity'])
anonymized_text = anonymized_text[:start] + token + anonymized_text[end:]
# Step 6: Return original text, anonymized text, and information about redacted entities
return [
original_text,
anonymized_text,
[{"word": e["word"], "label": e["entity"]} for e in entities_to_anonymize]
]
with gr.Blocks(title="Thai Clinical Conversation De-identification") as demo:
gr.HTML(
"""
<div style="text-align: center;">
<h1 style="font-size: 3em;">Thai Clinical Conversation De-identification</h1>
<p style="font-size: 1.2em;">Paste Thai clinical or personal text below to redact sensitive info.</p>
</div>
"""
)
# Use a gr.Row with gr.Column spacers to center the image
with gr.Row():
gr.Column(scale=1) # Left spacer
gr.Image(
value="mascot-image-landscape.png", # Replace with your image URL/path
width=200, # Set a smaller width for the image
show_label=False,
container=False # Prevent the image from being wrapped in a default Gradio container
)
gr.Column(scale=1) # Right spacer
# Add the main interface components
gr.Interface(
fn=anonymize_text,
inputs=gr.Textbox(lines=10, label="Input Text"),
outputs=[
gr.Textbox(label="Original Text"),
gr.Textbox(label="Anonymized Text"),
gr.JSON(label="Entities")
],
live=False, # Set live=False since we are using Blocks now
)
if __name__ == "__main__":
demo.launch(share=False)
|