|
4 | 4 | import datasets |
5 | 5 | from spacy.tokens import Doc |
6 | 6 | from datetime import datetime |
7 | | -from typing import Iterable, Iterator, Optional, Dict, List, cast, Union, Tuple, Callable |
| 7 | +from typing import Iterable, Iterator, Optional, Dict, List, cast, Union, Tuple, Callable, Type |
8 | 8 | from spacy.tokens import Span |
9 | 9 | import inspect |
10 | 10 | from functools import partial |
@@ -87,7 +87,13 @@ def create_eval_pipeline(self): |
87 | 87 | # NOTE: this will fix the DeID model(s) created before medcat 1.9.3 |
88 | 88 | # though this fix may very well be unstable |
89 | 89 | self.ner_pipe.tokenizer._in_target_context_manager = False |
| 90 | + # if not hasattr(self.ner_pipe.tokenizer, 'split_special_tokens'): |
| 91 | + # # NOTE: this will fix the DeID model(s) created with transformers before 4.42 |
| 92 | +        # # and allow them to run with later transformers versions |
| 93 | + # self.ner_pipe.tokenizer.split_special_tokens = False |
90 | 94 | self.ner_pipe.device = self.model.device |
| 95 | + self._consecutive_identical_failures = 0 |
| 96 | + self._last_exception: Optional[Tuple[str, Type[Exception]]] = None |
91 | 97 |
|
92 | 98 | def get_hash(self) -> str: |
93 | 99 | """A partial hash trying to catch differences between models. |
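
The two attributes added at the end of `create_eval_pipeline` (`_consecutive_identical_failures` and `_last_exception`) set up state for tracking repeated identical errors during processing; the `Tuple[str, Type[Exception]]` annotation suggests failures are keyed on both the exception message and its type. A minimal sketch of how such a tracker might be wired up, assuming a hypothetical `FailureTracker` class with a `record_failure` helper and a `_max_failures` threshold, none of which appear in this diff:

```python
from typing import Optional, Tuple, Type


class FailureTracker:
    """Counts consecutive failures that raise the same exception type and message."""

    def __init__(self, max_failures: int = 3) -> None:
        # Hypothetical threshold; the diff itself only resets the counter on success
        self._max_failures = max_failures
        self._consecutive_identical_failures = 0
        self._last_exception: Optional[Tuple[str, Type[Exception]]] = None

    def record_failure(self, exc: Exception) -> None:
        key = (str(exc), type(exc))
        if key == self._last_exception:
            self._consecutive_identical_failures += 1
        else:
            self._last_exception = key
            self._consecutive_identical_failures = 1
        if self._consecutive_identical_failures >= self._max_failures:
            # Re-raise rather than silently logging the same error forever
            raise exc

    def record_success(self) -> None:
        # Mirrors the reset the diff performs at the end of a successful document
        self._consecutive_identical_failures = 0
        self._last_exception = None
```

Under this reading, two failures count as "identical" only when both the exception type and its message repeat, which is exactly what the `(str(exc), type(exc))` key encodes.
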
@@ -390,34 +396,33 @@ def _process(self, |
390 | 396 | #all_text_processed = self.tokenizer.encode_eval(all_text) |
391 | 397 | # For now we will process the documents one by one, should be improved in the future to use batching |
392 | 398 | for doc in docs: |
393 | | - try: |
394 | | - res = self.ner_pipe(doc.text, aggregation_strategy=self.config.general['ner_aggregation_strategy']) |
395 | | - doc.ents = [] # type: ignore |
396 | | - for r in res: |
397 | | - inds = [] |
398 | | - for ind, word in enumerate(doc): |
399 | | - end_char = word.idx + len(word.text) |
400 | | - if end_char <= r['end'] and end_char > r['start']: |
401 | | - inds.append(ind) |
402 | | - # To not loop through everything |
403 | | - if end_char > r['end']: |
404 | | - break |
405 | | - if inds: |
406 | | - entity = Span(doc, min(inds), max(inds) + 1, label=r['entity_group']) |
407 | | - entity._.cui = r['entity_group'] |
408 | | - entity._.context_similarity = r['score'] |
409 | | - entity._.detected_name = r['word'] |
410 | | - entity._.id = len(doc._.ents) |
411 | | - entity._.confidence = r['score'] |
412 | | - |
413 | | - doc._.ents.append(entity) |
414 | | - create_main_ann(self.cdb, doc) |
415 | | - if self.cdb.config.general['make_pretty_labels'] is not None: |
416 | | - make_pretty_labels(self.cdb, doc, LabelStyle[self.cdb.config.general['make_pretty_labels']]) |
417 | | - if self.cdb.config.general['map_cui_to_group'] is not None and self.cdb.addl_info.get('cui2group', {}): |
418 | | - map_ents_to_groups(self.cdb, doc) |
419 | | - except Exception as e: |
420 | | - logger.warning(e, exc_info=True) |
| 399 | + res = self.ner_pipe(doc.text, aggregation_strategy=self.config.general['ner_aggregation_strategy']) |
| 400 | + doc.ents = [] # type: ignore |
| 401 | + for r in res: |
| 402 | + inds = [] |
| 403 | + for ind, word in enumerate(doc): |
| 404 | + end_char = word.idx + len(word.text) |
| 405 | + if end_char <= r['end'] and end_char > r['start']: |
| 406 | + inds.append(ind) |
| 407 | + # To not loop through everything |
| 408 | + if end_char > r['end']: |
| 409 | + break |
| 410 | + if inds: |
| 411 | + entity = Span(doc, min(inds), max(inds) + 1, label=r['entity_group']) |
| 412 | + entity._.cui = r['entity_group'] |
| 413 | + entity._.context_similarity = r['score'] |
| 414 | + entity._.detected_name = r['word'] |
| 415 | + entity._.id = len(doc._.ents) |
| 416 | + entity._.confidence = r['score'] |
| 417 | + |
| 418 | + doc._.ents.append(entity) |
| 419 | + create_main_ann(self.cdb, doc) |
| 420 | + if self.cdb.config.general['make_pretty_labels'] is not None: |
| 421 | + make_pretty_labels(self.cdb, doc, LabelStyle[self.cdb.config.general['make_pretty_labels']]) |
| 422 | + if self.cdb.config.general['map_cui_to_group'] is not None and self.cdb.addl_info.get('cui2group', {}): |
| 423 | + map_ents_to_groups(self.cdb, doc) |
| 424 | + self._consecutive_identical_failures = 0 # success |
| 425 | + self._last_exception = None |
421 | 426 | yield from docs |
422 | 427 |
|
423 | 428 | # Override |
|
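The heart of the rewritten `_process` loop is aligning each character-offset entity returned by the transformers NER pipeline with spaCy token indices. A self-contained sketch of that alignment, using `spacy.blank` and an illustrative entity dict (the sample text, offsets, and label are made up for the example):

```python
# Minimal sketch of the char-offset -> token-index alignment used in _process.
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Patient seen at Guy's Hospital yesterday.")

# Shape of one aggregated entity from a transformers NER pipeline
r = {"entity_group": "HOSPITAL", "score": 0.99, "word": "Guy's Hospital",
     "start": 16, "end": 30}

inds = []
for ind, word in enumerate(doc):
    end_char = word.idx + len(word.text)
    # A token belongs to the entity if it ends inside the window (start, end]
    if end_char <= r["end"] and end_char > r["start"]:
        inds.append(ind)
    if end_char > r["end"]:
        break  # past the entity; no need to scan the rest of the doc

if inds:
    entity = Span(doc, min(inds), max(inds) + 1, label=r["entity_group"])
    print(entity.text, entity.label_)  # -> Guy's Hospital HOSPITAL
```

A token is included when it ends inside the half-open character window `(start, end]`, and the loop breaks as soon as a token ends past `end`, so only a prefix of the document is scanned per entity.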