What to do when a sentence cannot be processed?

Then it just isn't a sentence.

I'm working on a processors update (details elided).  The code in `BalaurProcessor.annotate` presently looks as shown below.  I believe that if `document.sentences.map` were changed to `document.sentences.flatMap`, with successfully annotated sentences yielding `Some(sentence)` and sentences that throw an exception yielding `None`, then the problematic sentences would simply disappear.  They would then no longer cause problems downstream, where tags or other fields are suddenly `None` in an otherwise annotated document.

```scala
  /**
    * Annotates every sentence of `document` and returns a new document built via `copy`.
    *
    * Each sentence is run through `tokenClassifier.predictWithScores`, and the per-task
    * outputs are turned into POS tags, named entities, chunks and dependency graphs.
    * If prediction throws an `EncoderMaxTokensRuntimeException` or an `AssertionError`,
    * an error line is printed and the sentence is kept exactly as it was passed in.
    * When `numericEntityRecognizerOpt` is non-empty, the entities and norms of every
    * sentence are afterwards replaced with the output of `mkLabelsAndNorms`.
    *
    * @param document the document whose sentences should be annotated
    * @return a copy of `document` carrying the (possibly partially) annotated sentences
    */
  override def annotate(document: Document): Document = {
    // Each sentence goes through the MTL framework on its own.
    val sentencesAfterMtl = document.sentences.map { sentence =>
      val words = sentence.words
      // Lemmas are created deterministically, not through the MTL framework.
      // This call sits outside the try, so only the steps below are guarded.
      val lemmas = lemmatize(words)

      try {
        val predictions = tokenClassifier.predictWithScores(words)
        val posTags = mkPosTags(words, predictions(TASK_TO_INDEX(POS_TASK)))
        val optionalNerLabels =
          mkOptionalNerLabels(words, sentence.startOffsets, sentence.endOffsets, posTags, lemmas)
        val namedEntities =
          mkNamedEntityLabels(words, predictions(TASK_TO_INDEX(NER_TASK)), optionalNerLabels)
        val chunkLabels = mkChunkLabels(words, predictions(TASK_TO_INDEX(CHUNKING_TASK)))
        val dependencyGraphs = mkDependencyLabelsUsingHexaTags(
          words,
          lemmas,
          posTags,
          predictions(TASK_TO_INDEX(HEXA_TERM_TASK)),
          predictions(TASK_TO_INDEX(HEXA_NONTERM_TASK))
        )

        // Entities and norms may still be patched below, so this sentence is only partly annotated.
        sentence.copy(
          tags = Some(posTags),
          lemmas = Some(lemmas),
          entities = Some(namedEntities),
          chunks = Some(chunkLabels),
          graphs = dependencyGraphs
        )
      }
      catch {
        // If there was an exception, no values, not even the lemmas, are included:
        // the original sentence is returned unchanged.
        case _: EncoderMaxTokensRuntimeException =>
          // TODO: at some point do something smart here
          println(s"ERROR: This sentence exceeds the maximum number of tokens for the encoder and will not be annotated: ${sentence.words.mkString(" ")}")
          sentence
        case _: AssertionError =>
          println(s"ERROR: The output of predictWithScores does not satisfy assertions.  The sentence will not be annotated: ${sentence.words.mkString(" ")}")
          sentence
      }
    }
    val documentAfterMtl = document.copy(sentences = sentencesAfterMtl)

    if (numericEntityRecognizerOpt.isEmpty) documentAfterMtl
    else {
      // Overwrite entities and norms sentence by sentence with the numeric-aware values.
      val mentions = extractNumericEntityMentions(documentAfterMtl)
      val (labelsBySentence, normsBySentence) = mkLabelsAndNorms(documentAfterMtl, mentions)
      val patchedSentences = documentAfterMtl.sentences.zipWithIndex.map { case (sentence, index) =>
        sentence.copy(
          entities = Some(labelsBySentence(index)),
          norms = Some(normsBySentence(index))
        )
      }.toArray

      documentAfterMtl.copy(sentences = patchedSentences)
    }
  }
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

What to do when a sentence cannot be processed? #841

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

What to do when a sentence cannot be processed? #841

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions