Spaces:

loolootech
/

no-name-ner-th-demo

Running

App Files Files Community

no-name-ner-th-demo / app.py

atirut-name-looloo

Update app.py

0398229 verified 3 months ago

raw

history blame contribute delete

4.5 kB

	import gradio as gr
	from transformers import pipeline

	# Load NER model. Device -1 means CPU.
	# If you have a GPU, you can set device=0 or the appropriate GPU index.
	ner_model = pipeline("token-classification", model="loolootech/no-name-ner-th", device=-1)

	ENTITY_TO_ANONYMIZED_TOKEN_MAP = {
	"PERSON": "[PERSON]",
	"PHONE": "[PHONE]",
	"EMAIL": "[EMAIL]",
	"ADDRESS": "[LOCATION]",
	"DATE": "[DATE]",
	"NATIONAL_ID": "[NATIONAL_ID]",
	"HOSPITAL_IDS": "[HOSPITAL_IDS]",
	}

	def anonymize_text(original_text):
	"""
	Anonymizes sensitive entities in the input text using a named entity recognition (NER) model.

	Parameters
	----------
	original_text : str
	The text that may contain sensitive information such as names, phone numbers, emails, etc.

	Returns
	-------
	list
	A list containing:
	- original_text: the original input string
	- anonymized_text: the text with specified entities replaced by tokens
	- anonymized_entities: a list of dictionaries for each anonymized entity, containing
	the original word and its entity label

	Notes
	-----
	- Requires `ner_model` to be defined and initialized (e.g., a HuggingFace NER pipeline).
	- The mapping `ENTITY_TO_ANONYMIZED_TOKEN_MAP` defines which entity types will be replaced
	and what token will be used.
	"""

	# Step 1: Perform NER on the input text
	ner_results = ner_model(original_text)

	# Step 2: Combine overlapping or adjacent entities of the same type
	combined_entities = []
	for entity in ner_results:
	# Normalize entity label (e.g., "B-PERSON" -> "PERSON")
	entity_name = entity['entity'].split('-')[-1]
	entity['entity'] = entity_name

	# Add as new entity if list is empty, different type, or non-overlapping
	if not combined_entities or combined_entities[-1]['entity'] != entity_name or \
	combined_entities[-1]['start'] + len(combined_entities[-1]['word']) < entity['start']:
	combined_entities.append(entity)
	else:
	# Merge adjacent/overlapping entities of the same type
	combined_entities[-1]['word'] += ' ' + entity['word']
	combined_entities[-1]['end'] = entity['end']

	# Step 3: Filter entities that should be anonymized
	entities_to_anonymize = [
	e for e in combined_entities if e['entity'] in ENTITY_TO_ANONYMIZED_TOKEN_MAP.keys()
	]

	# Step 4: Sort entities in reverse order of start index to safely replace them
	entities_to_anonymize.sort(key=lambda x: x['start'], reverse=True)

	# Step 5: Replace each entity in the text with the corresponding anonymized token
	anonymized_text = original_text
	for entity in entities_to_anonymize:
	start, end = entity['start'], entity['end']
	token = ENTITY_TO_ANONYMIZED_TOKEN_MAP.get(entity['entity'])
	anonymized_text = anonymized_text[:start] + token + anonymized_text[end:]

	# Step 6: Return original text, anonymized text, and information about redacted entities
	return [
	original_text,
	anonymized_text,
	[{"word": e["word"], "label": e["entity"]} for e in entities_to_anonymize]
	]


	with gr.Blocks(title="Thai Clinical Conversation De-identification") as demo:
	gr.HTML(
	"""
	<div style="text-align: center;">
	<h1 style="font-size: 3em;">Thai Clinical Conversation De-identification</h1>
	<p style="font-size: 1.2em;">Paste Thai clinical or personal text below to redact sensitive info.</p>
	</div>
	"""
	)
	# Use a gr.Row with gr.Column spacers to center the image
	with gr.Row():
	gr.Column(scale=1) # Left spacer
	gr.Image(
	value="mascot-image-landscape.png", # Replace with your image URL/path
	width=200, # Set a smaller width for the image
	show_label=False,
	container=False # Prevent the image from being wrapped in a default Gradio container
	)
	gr.Column(scale=1) # Right spacer

	# Add the main interface components
	gr.Interface(
	fn=anonymize_text,
	inputs=gr.Textbox(lines=10, label="Input Text"),
	outputs=[
	gr.Textbox(label="Original Text"),
	gr.Textbox(label="Anonymized Text"),
	gr.JSON(label="Entities")
	],
	live=False, # Set live=False since we are using Blocks now
	)

	if __name__ == "__main__":
	demo.launch(share=False)