atirut-name-looloo committed on
Commit
f6ccd60
·
verified ·
1 Parent(s): b31b8a4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -0
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+
4
# Token-classification (NER) pipeline that drives the de-identification.
# device=-1 keeps inference on the CPU; pass a GPU index (e.g. device=0)
# instead when a GPU is available.
ner_model = pipeline(
    task="token-classification",
    model="loolootech/no-name-ner-th",
    device=-1,
)
7
+
8
# Entity label -> placeholder written in place of the detected span.
# Note that ADDRESS entities are rendered with the "[LOCATION]" placeholder.
ENTITY_TO_ANONYMIZED_TOKEN_MAP = dict(
    PERSON="[PERSON]",
    PHONE="[PHONE]",
    EMAIL="[EMAIL]",
    ADDRESS="[LOCATION]",
    DATE="[DATE]",
    NATIONAL_ID="[NATIONAL_ID]",
    HOSPITAL_IDS="[HOSPITAL_IDS]",
)
17
+
18
def anonymize_text(original_text):
    """
    Anonymizes sensitive entities in the input text using a named entity recognition (NER) model.

    Parameters
    ----------
    original_text : str
        The text that may contain sensitive information such as names, phone numbers, emails, etc.

    Returns
    -------
    list
        A list containing:
        - original_text: the original input string
        - anonymized_text: the text with specified entities replaced by tokens
        - anonymized_entities: a list of dictionaries for each anonymized entity, containing
          the redacted text (``"word"``, sliced from the original input) and its entity
          label (``"label"``). Entries are ordered from the last span in the text to the first.

    Notes
    -----
    - Requires `ner_model` to be defined and initialized (e.g., a HuggingFace NER pipeline).
    - The mapping `ENTITY_TO_ANONYMIZED_TOKEN_MAP` defines which entity types will be replaced
      and what token will be used.
    - Assumes every NER result carries integer ``start``/``end`` character offsets into
      `original_text` and that results arrive in text order.
    - Empty input is returned unchanged without calling the model.
    """
    # Nothing to analyse: skip the model call entirely.
    if not original_text:
        return [original_text, original_text, []]

    # Step 1: Perform NER on the input text
    ner_results = ner_model(original_text)

    # Step 2: Combine overlapping or adjacent entities of the same type.
    # Spans are tracked purely by character offsets: the token strings returned by the
    # model may carry subword markers, so their length is not a reliable span width.
    combined_entities = []
    for result in ner_results:
        # Normalize entity label (e.g., "B-PERSON" -> "PERSON")
        label = result['entity'].split('-')[-1]
        start, end = result['start'], result['end']

        previous = combined_entities[-1] if combined_entities else None
        # Merge when the spans overlap, touch, or are separated only by whitespace.
        # (An overlapping span yields an empty slice, which also counts as "no gap".)
        if (
            previous is not None
            and previous['entity'] == label
            and not original_text[previous['end']:start].strip()
        ):
            # max() keeps the span from shrinking when the new piece ends earlier.
            previous['end'] = max(previous['end'], end)
        else:
            # Store a fresh dict so the model's own result objects are not mutated.
            combined_entities.append({'entity': label, 'start': start, 'end': end})

    # Step 3: Filter entities that should be anonymized
    entities_to_anonymize = [
        e for e in combined_entities if e['entity'] in ENTITY_TO_ANONYMIZED_TOKEN_MAP
    ]

    # Step 4: Sort entities in reverse order of start index so that replacing one span
    # never shifts the offsets of the spans still to be processed
    entities_to_anonymize.sort(key=lambda e: e['start'], reverse=True)

    # Step 5: Replace each entity in the text with the corresponding anonymized token
    anonymized_text = original_text
    anonymized_entities = []
    for e in entities_to_anonymize:
        start, end = e['start'], e['end']
        token = ENTITY_TO_ANONYMIZED_TOKEN_MAP[e['entity']]
        anonymized_text = anonymized_text[:start] + token + anonymized_text[end:]
        # Report the exact text that was redacted rather than re-joined model tokens.
        anonymized_entities.append({"word": original_text[start:end], "label": e['entity']})

    # Step 6: Return original text, anonymized text, and information about redacted entities
    return [
        original_text,
        anonymized_text,
        anonymized_entities,
    ]
83
+
84
+
85
# Build the web UI: a centered title banner, the mascot image, and the
# de-identification form wired to anonymize_text. Components are laid out in
# the order they are created inside the Blocks context.
with gr.Blocks(title="Thai Clinical Conversation De-identification") as demo:
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1 style="font-size: 3em;">Thai Clinical Conversation De-identification</h1>
            <p style="font-size: 1.2em;">Paste Thai clinical or personal text below to redact sensitive info.</p>
        </div>
        """
    )
    # Use a gr.Row with gr.Column spacers to center the image
    with gr.Row():
        gr.Column(scale=1)  # Left spacer
        gr.Image(
            value="assets/mascot-image-landscape.png",  # Replace with your image URL/path
            width=200,  # Set a smaller width for the image
            show_label=False,
            container=False  # Prevent the image from being wrapped in a default Gradio container
        )
        gr.Column(scale=1)  # Right spacer

    # Add the main interface components: one input textbox, and three outputs that
    # match, in order, the three items of the list returned by anonymize_text
    gr.Interface(
        fn=anonymize_text,
        inputs=gr.Textbox(lines=10, label="Input Text"),
        outputs=[
            gr.Textbox(label="Original Text"),
            gr.Textbox(label="Anonymized Text"),
            gr.JSON(label="Entities")
        ],
        live=False,  # Set live=False since we are using Blocks now
    )

# Launch the app only when this file is executed directly (not when imported)
if __name__ == "__main__":
    demo.launch(share=False)