Skip to content

What to do when a sentence cannot be processed? #841

@kwalcock

Description

@kwalcock

Then it just isn't a sentence.

I'm working on a processors update (details elided). The code in BalaurProcessor.annotate presently looks as below. I believe that if document.sentences.map was just changed to document.sentences.flatMap and successfully annotated sentences resulted in Some(sentence) and the exceptions resulted in None, then the problematic sentences would just disappear and not cause problems downstream when tags or other fields are suddenly None in an otherwise annotated document.

  /** Annotates the sentences of a document and returns the annotated copy.
    *
    * Each sentence is first sent through the token classifier to obtain tags, entities,
    * chunks and graphs; lemmas are computed separately.  If the classifier step throws an
    * `EncoderMaxTokensRuntimeException` or an `AssertionError`, an error line is printed
    * and the sentence is kept exactly as it was received.  When `numericEntityRecognizerOpt`
    * is non-empty, the entities and norms of every sentence are then replaced with the
    * values produced by `mkLabelsAndNorms`.
    *
    * @param document the document whose sentences are to be annotated
    * @return a copy of `document` with its sentences replaced by the annotated ones
    */
  override def annotate(document: Document): Document = {
    // Each sentence goes through the MTL framework on its own.
    val sentencesAfterMtl = document.sentences.map { sentence =>
      val tokens = sentence.words
      // Lemmatization is deterministic and bypasses the MTL framework.  It runs before
      // the try, so anything it throws is not handled below.
      val tokenLemmas = lemmatize(tokens)

      try {
        val labelsAndScores = tokenClassifier.predictWithScores(tokens)
        val posTags = mkPosTags(tokens, labelsAndScores(TASK_TO_INDEX(POS_TASK)))
        val optionalNerLabels =
            mkOptionalNerLabels(tokens, sentence.startOffsets, sentence.endOffsets, posTags, tokenLemmas)
        val nerLabels =
            mkNamedEntityLabels(tokens, labelsAndScores(TASK_TO_INDEX(NER_TASK)), optionalNerLabels)
        val chunkLabels = mkChunkLabels(tokens, labelsAndScores(TASK_TO_INDEX(CHUNKING_TASK)))
        val dependencyGraphs = mkDependencyLabelsUsingHexaTags(
          tokens, tokenLemmas, posTags,
          labelsAndScores(TASK_TO_INDEX(HEXA_TERM_TASK)),
          labelsAndScores(TASK_TO_INDEX(HEXA_NONTERM_TASK))
        )

        // The entities and norms may still be patched below, which is why the result
        // is only a partly annotated sentence at this point.
        sentence.copy(
          tags = Some(posTags),
          lemmas = Some(tokenLemmas),
          entities = Some(nerLabels),
          chunks = Some(chunkLabels),
          graphs = dependencyGraphs
        )
      }
      catch {
        // No values, not even lemmas, will be included in the annotation if there was an exception.
        case _: EncoderMaxTokensRuntimeException =>
          // TODO: at some point do something smart here
          println(s"ERROR: This sentence exceeds the maximum number of tokens for the encoder and will not be annotated: ${sentence.words.mkString(" ")}")
          sentence
        case _: AssertionError =>
          println(s"ERROR: The output of predictWithScores does not satisfy assertions.  The sentence will not be annotated: ${sentence.words.mkString(" ")}")
          sentence
      }
    }
    val documentAfterMtl = document.copy(sentences = sentencesAfterMtl)

    if (numericEntityRecognizerOpt.nonEmpty) {
      val numericMentions = extractNumericEntityMentions(documentAfterMtl)
      val (patchedLabels, patchedNorms) = mkLabelsAndNorms(documentAfterMtl, numericMentions)
      // Overwrite the entities and norms of each sentence with the per-sentence results.
      val sentencesAfterNumerics = documentAfterMtl.sentences.zipWithIndex.map { case (sentence, position) =>
        sentence.copy(
          entities = Some(patchedLabels(position)),
          norms = Some(patchedNorms(position))
        )
      }.toArray

      documentAfterMtl.copy(sentences = sentencesAfterNumerics)
    }
    else documentAfterMtl
  }

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions