-
Notifications
You must be signed in to change notification settings - Fork 100
Open
Description
Then it just isn't a sentence.
I'm working on a processors update (details elided). The code in BalaurProcessor.annotate
presently looks as below. I believe that if document.sentences.map were changed to document.sentences.flatMap, with successfully annotated sentences resulting in Some(sentence) and the exceptions resulting in None, then the problematic sentences would simply disappear. They would then no longer cause problems downstream when tags or other fields are suddenly None in an otherwise annotated document.
/**
  * Annotates the sentences of a document and returns an updated copy of it.
  *
  * Each sentence is processed on its own: lemmas come from `lemmatize`, and the
  * output of `tokenClassifier.predictWithScores` is turned into POS tags, named
  * entity labels, chunk labels and dependency graphs. If prediction throws an
  * `EncoderMaxTokensRuntimeException` or an `AssertionError`, an error is printed
  * and the sentence is kept exactly as it was received, without even the lemmas.
  * `lemmatize` runs outside of the `try`, so anything it throws propagates.
  *
  * When `numericEntityRecognizerOpt` is non-empty, a second pass replaces the
  * entities and fills in the norms of every sentence.
  *
  * @param document the document whose sentences are to be annotated
  * @return a copy of `document` carrying the (re)annotated sentences
  */
override def annotate(document: Document): Document = {
  // First pass: push the sentences through the MTL framework, one at a time.
  val sentencesAfterFirstPass = document.sentences.map { sentence =>
    val words = sentence.words
    // Lemmatization is deterministic and does not involve the MTL framework.
    val lemmas = lemmatize(words)

    try {
      val labelsAndScores = tokenClassifier.predictWithScores(words)
      val tags = mkPosTags(words, labelsAndScores(TASK_TO_INDEX(POS_TASK)))
      val optionalEntities =
        mkOptionalNerLabels(words, sentence.startOffsets, sentence.endOffsets, tags, lemmas)
      val entities =
        mkNamedEntityLabels(words, labelsAndScores(TASK_TO_INDEX(NER_TASK)), optionalEntities)
      val chunks = mkChunkLabels(words, labelsAndScores(TASK_TO_INDEX(CHUNKING_TASK)))
      val graphs = mkDependencyLabelsUsingHexaTags(
        words,
        lemmas,
        tags,
        labelsAndScores(TASK_TO_INDEX(HEXA_TERM_TASK)),
        labelsAndScores(TASK_TO_INDEX(HEXA_NONTERM_TASK))
      )

      // Only a partial annotation: entities may still be patched and norms
      // filled in by the second pass below.
      sentence.copy(
        tags = Some(tags),
        lemmas = Some(lemmas),
        entities = Some(entities),
        chunks = Some(chunks),
        graphs = graphs
      )
    }
    catch {
      // If there was an exception, the sentence is passed through untouched:
      // no values, not even the lemmas, are included in the annotation.
      case _: EncoderMaxTokensRuntimeException =>
        // TODO: at some point do something smart here
        println(s"ERROR: This sentence exceeds the maximum number of tokens for the encoder and will not be annotated: ${sentence.words.mkString(" ")}")
        sentence
      case _: AssertionError =>
        println(s"ERROR: The output of predictWithScores does not satisfy assertions. The sentence will not be annotated: ${sentence.words.mkString(" ")}")
        sentence
    }
  }
  val partlyAnnotatedDocument = document.copy(sentences = sentencesAfterFirstPass)

  // Second pass: only performed when a numeric entity recognizer is available.
  if (numericEntityRecognizerOpt.isEmpty) partlyAnnotatedDocument
  else {
    val numericMentions = extractNumericEntityMentions(partlyAnnotatedDocument)
    val (newLabels, newNorms) = mkLabelsAndNorms(partlyAnnotatedDocument, numericMentions)
    // Labels and norms are looked up by sentence position in the document.
    val fullyAnnotatedSentences = Array.tabulate(partlyAnnotatedDocument.sentences.length) { index =>
      partlyAnnotatedDocument.sentences(index).copy(
        entities = Some(newLabels(index)),
        norms = Some(newNorms(index))
      )
    }

    partlyAnnotatedDocument.copy(sentences = fullyAnnotatedSentences)
  }
}
Metadata
Metadata
Assignees
Labels
No labels