This repository was archived by the owner on Jul 28, 2025. It is now read-only.

Commit 44db08b

CU-8695ucw9b deid transformers fix (#490)
* CU-8695ucw9b: Fix older DeID models broken by changes in transformers. Since transformers 4.42.0, the tokenizer is expected to have the 'split_special_tokens' attribute, but the versions we've saved do not, so loading them raises an exception (which is currently caught and logged by medcat).
* CU-8695ucw9b: Add functionality for transformers NER to fail spectacularly upon consistent consecutive exceptions. This way, if something in the underlying models is consistently failing, the exception is raised rather than simply logged.
* CU-8695ucw9b: Add tests for exception raising after a pre-defined number of failed document processes.
* CU-8695ucw9b: Change the conditions for raising an exception on consecutive failure. The exception is now raised only if the consecutive failures are identical (or similar), as determined from the type and string representation of the exception being raised.
* CU-8695ucw9b: Small additional cleanup on successful TNER processing.
* CU-8695ucw9b: Use a custom exception when failing due to consecutive exceptions.
* CU-8695ucw9b: Remove the try-except when processing transformers NER to force immediate raising of exceptions.
1 parent b433195 commit 44db08b
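
Only the failure counters and their reset on success appear in the diff below; the raising side lives elsewhere in the codebase. A minimal sketch of the identical-failure check the message describes, assuming hypothetical names for the threshold and the custom exception (only `_consecutive_identical_failures` and `_last_exception` come from the diff):

```python
from typing import Optional, Tuple, Type

# Hypothetical names: the threshold constant and exception class are
# assumptions for illustration; only the two tracked attributes below
# appear in the committed diff.
MAX_CONSECUTIVE_FAILURES = 10

class ConsecutiveFailuresError(Exception):
    """Raised when document processing keeps failing with the same error."""

class FailureTracker:
    def __init__(self) -> None:
        self._consecutive_identical_failures = 0
        self._last_exception: Optional[Tuple[str, Type[Exception]]] = None

    def record_failure(self, exc: Exception) -> None:
        # Two failures count as "identical (or similar)" when both the
        # string representation and the exception type match.
        fingerprint = (str(exc), type(exc))
        if fingerprint == self._last_exception:
            self._consecutive_identical_failures += 1
        else:
            self._last_exception = fingerprint
            self._consecutive_identical_failures = 1
        if self._consecutive_identical_failures >= MAX_CONSECUTIVE_FAILURES:
            raise ConsecutiveFailuresError(
                f"{self._consecutive_identical_failures} identical consecutive failures"
            ) from exc

    def record_success(self) -> None:
        # Mirrors the reset at the end of the successful-processing path.
        self._consecutive_identical_failures = 0
        self._last_exception = None
```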

File tree

1 file changed: +34 −29 lines changed

medcat/ner/transformers_ner.py

Lines changed: 34 additions & 29 deletions
```diff
@@ -4,7 +4,7 @@
 import datasets
 from spacy.tokens import Doc
 from datetime import datetime
-from typing import Iterable, Iterator, Optional, Dict, List, cast, Union, Tuple, Callable
+from typing import Iterable, Iterator, Optional, Dict, List, cast, Union, Tuple, Callable, Type
 from spacy.tokens import Span
 import inspect
 from functools import partial
```
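
The new `Type` import supports the `Optional[Tuple[str, Type[Exception]]]` annotation introduced in `create_eval_pipeline` in the next hunk.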
```diff
@@ -87,7 +87,13 @@ def create_eval_pipeline(self):
         # NOTE: this will fix the DeID model(s) created before medcat 1.9.3
         # though this fix may very well be unstable
         self.ner_pipe.tokenizer._in_target_context_manager = False
+        if not hasattr(self.ner_pipe.tokenizer, 'split_special_tokens'):
+            # NOTE: this will fix the DeID model(s) created with transformers before 4.42
+            # and allow them to run with later transformers
+            self.ner_pipe.tokenizer.split_special_tokens = False
         self.ner_pipe.device = self.model.device
+        self._consecutive_identical_failures = 0
+        self._last_exception: Optional[Tuple[str, Type[Exception]]] = None

     def get_hash(self) -> str:
         """A partial hash trying to catch differences between models.
```
```diff
@@ -390,34 +396,33 @@ def _process(self,
         #all_text_processed = self.tokenizer.encode_eval(all_text)
         # For now we will process the documents one by one, should be improved in the future to use batching
         for doc in docs:
-            try:
-                res = self.ner_pipe(doc.text, aggregation_strategy=self.config.general['ner_aggregation_strategy'])
-                doc.ents = []  # type: ignore
-                for r in res:
-                    inds = []
-                    for ind, word in enumerate(doc):
-                        end_char = word.idx + len(word.text)
-                        if end_char <= r['end'] and end_char > r['start']:
-                            inds.append(ind)
-                        # To not loop through everything
-                        if end_char > r['end']:
-                            break
-                    if inds:
-                        entity = Span(doc, min(inds), max(inds) + 1, label=r['entity_group'])
-                        entity._.cui = r['entity_group']
-                        entity._.context_similarity = r['score']
-                        entity._.detected_name = r['word']
-                        entity._.id = len(doc._.ents)
-                        entity._.confidence = r['score']
-
-                        doc._.ents.append(entity)
-                create_main_ann(self.cdb, doc)
-                if self.cdb.config.general['make_pretty_labels'] is not None:
-                    make_pretty_labels(self.cdb, doc, LabelStyle[self.cdb.config.general['make_pretty_labels']])
-                if self.cdb.config.general['map_cui_to_group'] is not None and self.cdb.addl_info.get('cui2group', {}):
-                    map_ents_to_groups(self.cdb, doc)
-            except Exception as e:
-                logger.warning(e, exc_info=True)
+            res = self.ner_pipe(doc.text, aggregation_strategy=self.config.general['ner_aggregation_strategy'])
+            doc.ents = []  # type: ignore
+            for r in res:
+                inds = []
+                for ind, word in enumerate(doc):
+                    end_char = word.idx + len(word.text)
+                    if end_char <= r['end'] and end_char > r['start']:
+                        inds.append(ind)
+                    # To not loop through everything
+                    if end_char > r['end']:
+                        break
+                if inds:
+                    entity = Span(doc, min(inds), max(inds) + 1, label=r['entity_group'])
+                    entity._.cui = r['entity_group']
+                    entity._.context_similarity = r['score']
+                    entity._.detected_name = r['word']
+                    entity._.id = len(doc._.ents)
+                    entity._.confidence = r['score']
+
+                    doc._.ents.append(entity)
+            create_main_ann(self.cdb, doc)
+            if self.cdb.config.general['make_pretty_labels'] is not None:
+                make_pretty_labels(self.cdb, doc, LabelStyle[self.cdb.config.general['make_pretty_labels']])
+            if self.cdb.config.general['map_cui_to_group'] is not None and self.cdb.addl_info.get('cui2group', {}):
+                map_ents_to_groups(self.cdb, doc)
+            self._consecutive_identical_failures = 0  # success
+            self._last_exception = None
         yield from docs

     # Override
```
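
With the try-except gone, any exception from the pipeline now propagates to the caller instead of being logged and swallowed, and the two new lines at the end of the loop body clear the failure state once a document has been fully processed. The character-to-token mapping retained in this hunk can be illustrated standalone (a sketch with made-up values, not code from the repository):

```python
# Worked sketch of the character-to-token mapping in the loop above, using
# plain (text, idx) tuples in place of spaCy tokens; values are illustrative.
tokens = [("The", 0), ("patient", 4), ("name", 12)]
r = {"start": 4, "end": 11}  # pipeline entity span, in character offsets

inds = []
for ind, (text, idx) in enumerate(tokens):
    end_char = idx + len(text)
    # A token belongs to the entity when it ends inside (start, end].
    if end_char <= r["end"] and end_char > r["start"]:
        inds.append(ind)
    # Tokens are ordered, so nothing after this point can still match.
    if end_char > r["end"]:
        break

print(inds)  # [1] -> only "patient" (chars 4-11) falls inside the entity
```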
