Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Upload 3 files
Browse files
    	
        transformers_rec/configuration.py
    CHANGED
    
    | @@ -1,3 +1,5 @@ | |
|  | |
|  | |
| 1 | 
             
            STANFORD_COFIGURATION = {
         | 
| 2 | 
             
                "DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
         | 
| 3 | 
             
                "PRESIDIO_SUPPORTED_ENTITIES": [
         | 
| @@ -11,7 +13,8 @@ STANFORD_COFIGURATION = { | |
| 11 | 
             
                    "DEVICE",
         | 
| 12 | 
             
                    "ZIP",
         | 
| 13 | 
             
                    "PROFESSION",
         | 
| 14 | 
            -
                    "USERNAME"
         | 
|  | |
| 15 |  | 
| 16 | 
             
                ],
         | 
| 17 | 
             
                "LABELS_TO_IGNORE": ["O"],
         | 
| @@ -22,8 +25,8 @@ STANFORD_COFIGURATION = { | |
| 22 | 
             
                    "DOCTOR": "PERSON",
         | 
| 23 | 
             
                    "PATIENT": "PERSON",
         | 
| 24 | 
             
                    "HOSPITAL": "LOCATION",
         | 
| 25 | 
            -
                    "MEDICALRECORD": " | 
| 26 | 
            -
                    "IDNUM": " | 
| 27 | 
             
                    "ORGANIZATION": "ORGANIZATION",
         | 
| 28 | 
             
                    "ZIP": "ZIP",
         | 
| 29 | 
             
                    "PHONE": "PHONE_NUMBER",
         | 
| @@ -55,6 +58,8 @@ STANFORD_COFIGURATION = { | |
| 55 | 
             
                },
         | 
| 56 | 
             
                "CHUNK_OVERLAP_SIZE": 40,
         | 
| 57 | 
             
                "CHUNK_SIZE": 600,
         | 
|  | |
|  | |
| 58 | 
             
            }
         | 
| 59 |  | 
| 60 |  | 
| @@ -70,6 +75,7 @@ BERT_DEID_CONFIGURATION = { | |
| 70 | 
             
                    "ZIP",
         | 
| 71 | 
             
                    "PROFESSION",
         | 
| 72 | 
             
                    "USERNAME",
         | 
|  | |
| 73 | 
             
                ],
         | 
| 74 | 
             
                "DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
         | 
| 75 | 
             
                "LABELS_TO_IGNORE": ["O"],
         | 
| @@ -102,7 +108,7 @@ BERT_DEID_CONFIGURATION = { | |
| 102 | 
             
                    "LOC": "LOCATION",
         | 
| 103 | 
             
                    "ORG": "ORGANIZATION",
         | 
| 104 | 
             
                    "AGE": "AGE",
         | 
| 105 | 
            -
                    "ID": " | 
| 106 | 
             
                    "EMAIL": "EMAIL",
         | 
| 107 | 
             
                    "PATIENT": "PERSON",
         | 
| 108 | 
             
                    "STAFF": "PERSON",
         | 
| @@ -113,4 +119,6 @@ BERT_DEID_CONFIGURATION = { | |
| 113 | 
             
                },
         | 
| 114 | 
             
                "CHUNK_OVERLAP_SIZE": 40,
         | 
| 115 | 
             
                "CHUNK_SIZE": 600,
         | 
|  | |
|  | |
| 116 | 
             
            }
         | 
|  | |
| 1 | 
            +
            ## Taken from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/configuration.py
         | 
| 2 | 
            +
             | 
| 3 | 
             
            STANFORD_COFIGURATION = {
         | 
| 4 | 
             
                "DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
         | 
| 5 | 
             
                "PRESIDIO_SUPPORTED_ENTITIES": [
         | 
|  | |
| 13 | 
             
                    "DEVICE",
         | 
| 14 | 
             
                    "ZIP",
         | 
| 15 | 
             
                    "PROFESSION",
         | 
| 16 | 
            +
                    "USERNAME",
         | 
| 17 | 
            +
                    "ID"
         | 
| 18 |  | 
| 19 | 
             
                ],
         | 
| 20 | 
             
                "LABELS_TO_IGNORE": ["O"],
         | 
|  | |
| 25 | 
             
                    "DOCTOR": "PERSON",
         | 
| 26 | 
             
                    "PATIENT": "PERSON",
         | 
| 27 | 
             
                    "HOSPITAL": "LOCATION",
         | 
| 28 | 
            +
                    "MEDICALRECORD": "ID",
         | 
| 29 | 
            +
                    "IDNUM": "ID",
         | 
| 30 | 
             
                    "ORGANIZATION": "ORGANIZATION",
         | 
| 31 | 
             
                    "ZIP": "ZIP",
         | 
| 32 | 
             
                    "PHONE": "PHONE_NUMBER",
         | 
|  | |
| 58 | 
             
                },
         | 
| 59 | 
             
                "CHUNK_OVERLAP_SIZE": 40,
         | 
| 60 | 
             
                "CHUNK_SIZE": 600,
         | 
| 61 | 
            +
                "ID_SCORE_MULTIPLIER": 0.4,
         | 
| 62 | 
            +
                "ID_ENTITY_NAME": "ID"
         | 
| 63 | 
             
            }
         | 
| 64 |  | 
| 65 |  | 
|  | |
| 75 | 
             
                    "ZIP",
         | 
| 76 | 
             
                    "PROFESSION",
         | 
| 77 | 
             
                    "USERNAME",
         | 
| 78 | 
            +
                    "ID"
         | 
| 79 | 
             
                ],
         | 
| 80 | 
             
                "DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
         | 
| 81 | 
             
                "LABELS_TO_IGNORE": ["O"],
         | 
|  | |
| 108 | 
             
                    "LOC": "LOCATION",
         | 
| 109 | 
             
                    "ORG": "ORGANIZATION",
         | 
| 110 | 
             
                    "AGE": "AGE",
         | 
| 111 | 
            +
                    "ID": "ID",
         | 
| 112 | 
             
                    "EMAIL": "EMAIL",
         | 
| 113 | 
             
                    "PATIENT": "PERSON",
         | 
| 114 | 
             
                    "STAFF": "PERSON",
         | 
|  | |
| 119 | 
             
                },
         | 
| 120 | 
             
                "CHUNK_OVERLAP_SIZE": 40,
         | 
| 121 | 
             
                "CHUNK_SIZE": 600,
         | 
| 122 | 
            +
                "ID_SCORE_MULTIPLIER": 0.4,
         | 
| 123 | 
            +
                "ID_ENTITY_NAME": "ID"
         | 
| 124 | 
             
            }
         | 
    	
        transformers_rec/transformers_recognizer.py
    CHANGED
    
    | @@ -1,3 +1,5 @@ | |
|  | |
|  | |
| 1 | 
             
            import copy
         | 
| 2 | 
             
            import logging
         | 
| 3 | 
             
            from typing import Optional, List
         | 
| @@ -90,6 +92,8 @@ class TransformersRecognizer(EntityRecognizer): | |
| 90 | 
             
                    self.default_explanation = None
         | 
| 91 | 
             
                    self.text_overlap_length = None
         | 
| 92 | 
             
                    self.chunk_length = None
         | 
|  | |
|  | |
| 93 |  | 
| 94 | 
             
                def load_transformer(self, **kwargs) -> None:
         | 
| 95 | 
             
                    """Load external configuration parameters and set default values.
         | 
| @@ -104,6 +108,8 @@ class TransformersRecognizer(EntityRecognizer): | |
| 104 | 
             
                    **CHUNK_SIZE (int) - number of characters in each chunk of text
         | 
| 105 | 
             
                    **LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
         | 
| 106 | 
             
                    **DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
         | 
|  | |
|  | |
| 107 | 
             
                    """
         | 
| 108 |  | 
| 109 | 
             
                    self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
         | 
| @@ -113,6 +119,9 @@ class TransformersRecognizer(EntityRecognizer): | |
| 113 | 
             
                    self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
         | 
| 114 | 
             
                    self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
         | 
| 115 | 
             
                    self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
         | 
|  | |
|  | |
|  | |
| 116 | 
             
                    if not self.pipeline:
         | 
| 117 | 
             
                        if not self.model_path:
         | 
| 118 | 
             
                            self.model_path = "obi/deid_roberta_i2b2"
         | 
| @@ -165,11 +174,14 @@ class TransformersRecognizer(EntityRecognizer): | |
| 165 | 
             
                    ner_results = self._get_ner_results_for_text(text)
         | 
| 166 |  | 
| 167 | 
             
                    for res in ner_results:
         | 
| 168 | 
            -
                         | 
| 169 | 
            -
                        if not  | 
| 170 | 
             
                            continue
         | 
| 171 |  | 
| 172 | 
            -
                        res["entity_group"]  | 
|  | |
|  | |
|  | |
| 173 | 
             
                        textual_explanation = self.default_explanation.format(res["entity_group"])
         | 
| 174 | 
             
                        explanation = self.build_transformers_explanation(
         | 
| 175 | 
             
                            float(round(res["score"], 2)), textual_explanation, res["word"]
         | 
| @@ -224,33 +236,32 @@ class TransformersRecognizer(EntityRecognizer): | |
| 224 | 
             
                    model_max_length = self.pipeline.tokenizer.model_max_length
         | 
| 225 | 
             
                    # calculate inputs based on the text
         | 
| 226 | 
             
                    text_length = len(text)
         | 
| 227 | 
            -
                     | 
| 228 | 
            -
                    if text_length  | 
| 229 | 
            -
                         | 
|  | |
| 230 | 
             
                        logger.info(
         | 
| 231 | 
            -
                            f"splitting the text into chunks, length {text_length} > {model_max_length | 
| 232 | 
             
                        )
         | 
| 233 | 
            -
             | 
| 234 | 
             
                        chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
         | 
| 235 | 
             
                            text_length, self.chunk_length, self.text_overlap_length
         | 
| 236 | 
            -
             | 
| 237 | 
            -
                    else:
         | 
| 238 | 
            -
                        chunk_indexes = [[0, text_length]]
         | 
| 239 |  | 
| 240 | 
            -
             | 
| 241 | 
            -
             | 
| 242 | 
            -
             | 
| 243 | 
            -
             | 
| 244 |  | 
| 245 | 
            -
             | 
| 246 | 
            -
             | 
| 247 | 
            -
             | 
| 248 | 
            -
             | 
| 249 | 
            -
             | 
| 250 | 
            -
             | 
| 251 | 
            -
             | 
| 252 |  | 
| 253 | 
            -
             | 
| 254 |  | 
| 255 | 
             
                    # remove duplicates
         | 
| 256 | 
             
                    predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
         | 
| @@ -302,27 +313,24 @@ class TransformersRecognizer(EntityRecognizer): | |
| 302 | 
             
                    )
         | 
| 303 | 
             
                    return explanation
         | 
| 304 |  | 
| 305 | 
            -
                def __check_label_transformer(self, label: str) -> str:
         | 
| 306 | 
             
                    """The function validates the predicted label is identified by Presidio
         | 
| 307 | 
             
                    and maps the string into a Presidio representation
         | 
| 308 | 
             
                    :param label: Predicted label by the model
         | 
| 309 | 
            -
                    : | 
| 310 | 
            -
                    :return: Returns the predicted entity if the label is found in model_to_presidio mapping dictionary
         | 
| 311 | 
            -
                    and is supported by Presidio entities
         | 
| 312 | 
            -
                    :rtype: str
         | 
| 313 | 
             
                    """
         | 
| 314 |  | 
| 315 | 
            -
                    if label == "O":
         | 
| 316 | 
            -
                        return label
         | 
| 317 | 
            -
             | 
| 318 | 
             
                    # convert model label to presidio label
         | 
| 319 | 
             
                    entity = self.model_to_presidio_mapping.get(label, None)
         | 
| 320 |  | 
|  | |
|  | |
|  | |
| 321 | 
             
                    if entity is None:
         | 
| 322 | 
            -
                        logger.warning(f"Found unrecognized label {label}, returning entity as  | 
| 323 | 
            -
                        return  | 
| 324 |  | 
| 325 | 
             
                    if entity not in self.supported_entities:
         | 
| 326 | 
             
                        logger.warning(f"Found entity {entity} which is not supported by Presidio")
         | 
| 327 | 
            -
                        return  | 
| 328 | 
             
                    return entity
         | 
|  | |
| 1 | 
            +
            # Modified from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/transformer_recognizer.py
         | 
| 2 | 
            +
             | 
| 3 | 
             
            import copy
         | 
| 4 | 
             
            import logging
         | 
| 5 | 
             
            from typing import Optional, List
         | 
|  | |
| 92 | 
             
                    self.default_explanation = None
         | 
| 93 | 
             
                    self.text_overlap_length = None
         | 
| 94 | 
             
                    self.chunk_length = None
         | 
| 95 | 
            +
                    self.id_entity_name = None
         | 
| 96 | 
            +
                    self.id_score_reduction = None
         | 
| 97 |  | 
| 98 | 
             
                def load_transformer(self, **kwargs) -> None:
         | 
| 99 | 
             
                    """Load external configuration parameters and set default values.
         | 
|  | |
| 108 | 
             
                    **CHUNK_SIZE (int) - number of characters in each chunk of text
         | 
| 109 | 
             
                    **LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
         | 
| 110 | 
             
                    **DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
         | 
| 111 | 
            +
                    **ID_ENTITY_NAME (str) - name of the ID entity
         | 
| 112 | 
            +
                    **ID_SCORE_REDUCTION (float) - score multiplier for ID entities
         | 
| 113 | 
             
                    """
         | 
| 114 |  | 
| 115 | 
             
                    self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
         | 
|  | |
| 119 | 
             
                    self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
         | 
| 120 | 
             
                    self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
         | 
| 121 | 
             
                    self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
         | 
| 122 | 
            +
                    self.id_entity_name = kwargs.get("ID_ENTITY_NAME", "ID")
         | 
| 123 | 
            +
                    self.id_score_reduction = kwargs.get("ID_SCORE_REDUCTION", 0.5)
         | 
| 124 | 
            +
             | 
| 125 | 
             
                    if not self.pipeline:
         | 
| 126 | 
             
                        if not self.model_path:
         | 
| 127 | 
             
                            self.model_path = "obi/deid_roberta_i2b2"
         | 
|  | |
| 174 | 
             
                    ner_results = self._get_ner_results_for_text(text)
         | 
| 175 |  | 
| 176 | 
             
                    for res in ner_results:
         | 
| 177 | 
            +
                        res["entity_group"] = self.__check_label_transformer(res["entity_group"])
         | 
| 178 | 
            +
                        if not res["entity_group"]:
         | 
| 179 | 
             
                            continue
         | 
| 180 |  | 
| 181 | 
            +
                        if res["entity_group"] == self.id_entity_name:
         | 
| 182 | 
            +
                            print(f"ID entity found, multiplying score by {self.id_score_reduction}")
         | 
| 183 | 
            +
                            res["score"] = res["score"] * self.id_score_reduction
         | 
| 184 | 
            +
             | 
| 185 | 
             
                        textual_explanation = self.default_explanation.format(res["entity_group"])
         | 
| 186 | 
             
                        explanation = self.build_transformers_explanation(
         | 
| 187 | 
             
                            float(round(res["score"], 2)), textual_explanation, res["word"]
         | 
|  | |
| 236 | 
             
                    model_max_length = self.pipeline.tokenizer.model_max_length
         | 
| 237 | 
             
                    # calculate inputs based on the text
         | 
| 238 | 
             
                    text_length = len(text)
         | 
| 239 | 
            +
                    # split text into chunks
         | 
| 240 | 
            +
                    if text_length <= model_max_length:
         | 
| 241 | 
            +
                        predictions = self.pipeline(text)
         | 
| 242 | 
            +
                    else:
         | 
| 243 | 
             
                        logger.info(
         | 
| 244 | 
            +
                            f"splitting the text into chunks, length {text_length} > {model_max_length}"
         | 
| 245 | 
             
                        )
         | 
| 246 | 
            +
                        predictions = list()
         | 
| 247 | 
             
                        chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
         | 
| 248 | 
             
                            text_length, self.chunk_length, self.text_overlap_length
         | 
| 249 | 
            +
                            )
         | 
|  | |
|  | |
| 250 |  | 
| 251 | 
            +
                        # iterate over text chunks and run inference
         | 
| 252 | 
            +
                        for chunk_start, chunk_end in chunk_indexes:
         | 
| 253 | 
            +
                            chunk_text = text[chunk_start:chunk_end]
         | 
| 254 | 
            +
                            chunk_preds = self.pipeline(chunk_text)
         | 
| 255 |  | 
| 256 | 
            +
                            # align indexes to match the original text - add to each position the value of chunk_start
         | 
| 257 | 
            +
                            aligned_predictions = list()
         | 
| 258 | 
            +
                            for prediction in chunk_preds:
         | 
| 259 | 
            +
                                prediction_tmp = copy.deepcopy(prediction)
         | 
| 260 | 
            +
                                prediction_tmp["start"] += chunk_start
         | 
| 261 | 
            +
                                prediction_tmp["end"] += chunk_start
         | 
| 262 | 
            +
                                aligned_predictions.append(prediction_tmp)
         | 
| 263 |  | 
| 264 | 
            +
                            predictions.extend(aligned_predictions)
         | 
| 265 |  | 
| 266 | 
             
                    # remove duplicates
         | 
| 267 | 
             
                    predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
         | 
|  | |
| 313 | 
             
                    )
         | 
| 314 | 
             
                    return explanation
         | 
| 315 |  | 
| 316 | 
            +
                def __check_label_transformer(self, label: str) -> Optional[str]:
         | 
| 317 | 
             
                    """The function validates the predicted label is identified by Presidio
         | 
| 318 | 
             
                    and maps the string into a Presidio representation
         | 
| 319 | 
             
                    :param label: Predicted label by the model
         | 
| 320 | 
            +
                    :return: Returns the adjusted entity name
         | 
|  | |
|  | |
|  | |
| 321 | 
             
                    """
         | 
| 322 |  | 
|  | |
|  | |
|  | |
| 323 | 
             
                    # convert model label to presidio label
         | 
| 324 | 
             
                    entity = self.model_to_presidio_mapping.get(label, None)
         | 
| 325 |  | 
| 326 | 
            +
                    if entity in self.ignore_labels:
         | 
| 327 | 
            +
                        return None
         | 
| 328 | 
            +
             | 
| 329 | 
             
                    if entity is None:
         | 
| 330 | 
            +
                        logger.warning(f"Found unrecognized label {label}, returning entity as is")
         | 
| 331 | 
            +
                        return label
         | 
| 332 |  | 
| 333 | 
             
                    if entity not in self.supported_entities:
         | 
| 334 | 
             
                        logger.warning(f"Found entity {entity} which is not supported by Presidio")
         | 
| 335 | 
            +
                        return entity
         | 
| 336 | 
             
                    return entity
         |