From 648c5f274c4af9541f1d71c4c50db6961aecc3ed Mon Sep 17 00:00:00 2001 From: David An Date: Tue, 17 Dec 2024 16:43:34 -0500 Subject: [PATCH 1/3] TsvFileSupport benchmark --- .../utils/TsvFileSupportBenchmark.scala | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 benchmarks/src/main/scala/org/broadinstitute/dsde/firecloud/utils/TsvFileSupportBenchmark.scala diff --git a/benchmarks/src/main/scala/org/broadinstitute/dsde/firecloud/utils/TsvFileSupportBenchmark.scala b/benchmarks/src/main/scala/org/broadinstitute/dsde/firecloud/utils/TsvFileSupportBenchmark.scala new file mode 100644 index 000000000..fc0d40102 --- /dev/null +++ b/benchmarks/src/main/scala/org/broadinstitute/dsde/firecloud/utils/TsvFileSupportBenchmark.scala @@ -0,0 +1,61 @@ +package org.broadinstitute.dsde.firecloud.utils + +import org.broadinstitute.dsde.firecloud.model.{EntityUpdateDefinition, FlexibleModelSchema, ModelSchema} +import org.broadinstitute.dsde.firecloud.service.TSVFileSupport +import org.broadinstitute.dsde.firecloud.utils.TsvFileSupportBenchmark.TsvData +import org.openjdk.jmh.annotations.{Benchmark, Scope, State} +import org.openjdk.jmh.infra.Blackhole + +object TsvFileSupportBenchmark { + @State(Scope.Thread) + class TsvData { + val entityType: String = "sample" + val memberTypeOpt: Option[String] = None + // representation of the inbound TSV row + val row: Seq[String] = Seq( + "0005", + "foo", + "\"some\tquoted\tvalue\"", + "42", + "true", + "-123.456", + "gs://some-bucket/somefile.ext", + """{"entityType":"targetType", "entityName":"targetName"}""", + "[1,2,3,4,5]" + ) + val colInfo: Seq[(String, Option[String])] = Seq( + ("sample_id", None), + ("string", None), + ("quotedstring", None), + ("int", None), + ("boolean", None), + ("double", None), + ("file", None), + ("reference", None), + ("array", None) + ) + val modelSchema: ModelSchema = FlexibleModelSchema + val deleteEmptyValues: Boolean = false + } +} + +class TsvFileSupportBenchmark { + + @Benchmark + def makeEntityRows(blackHole: Blackhole, tsvData: TsvData): EntityUpdateDefinition = { + + val result: EntityUpdateDefinition = TsvFileSupportHarness.setAttributesOnEntity(tsvData.entityType, + tsvData.memberTypeOpt, + tsvData.row, + tsvData.colInfo, + tsvData.modelSchema, + tsvData.deleteEmptyValues + ) + blackHole.consume(result) + result + } + +} + +// helper object to get access to TSVFileSupport trait +object TsvFileSupportHarness extends TSVFileSupport {} From ed275ec29fd487778f035b9a515377528ee00150 Mon Sep 17 00:00:00 2001 From: David An Date: Tue, 17 Dec 2024 17:21:06 -0500 Subject: [PATCH 2/3] check first character of inbound TSV cell --- .../firecloud/service/TSVFileSupport.scala | 93 ++++++++++++++----- 1 file changed, 68 insertions(+), 25 deletions(-) diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupport.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupport.scala index b7bffc333..281ebd8a5 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupport.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupport.scala @@ -20,8 +20,8 @@ import scala.concurrent.{ExecutionContext, Future} import scala.util.{Failure, Success, Try} /** - * The different types of tsv import/export formats - */ + * The different types of tsv import/export formats + */ object TsvTypes { sealed trait TsvType case object ENTITY extends TsvType { @@ -179,31 +179,74 @@ trait TSVFileSupport { Creates an AttributeValue whose implementation is more closely tied to the value of the input. */ def stringToTypedAttribute(value: String): Attribute = - Try(java.lang.Integer.parseInt(value)) match { - case Success(intValue) => AttributeNumber(intValue) - case Failure(_) => - Try(java.lang.Double.parseDouble(value)) match { - // because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN, - // if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through - // to AttributeString. - case Success(doubleValue) - if !Double.NegativeInfinity.equals(doubleValue) - && !Double.PositiveInfinity.equals(doubleValue) - && !Double.NaN.equals(doubleValue) - && !matchesLiteral(value) => - AttributeNumber(doubleValue) - case _ => - Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")) match { - case Success(booleanValue) => AttributeBoolean(booleanValue) - case Failure(_) => - Try(value.parseJson.convertTo[AttributeEntityReference]) match { - case Success(ref) => ref - case Failure(_) => AttributeString(value) - } - } - } + // if this value starts and ends with a quote, it should always be treated as a string + if (value.startsWith("\"") && value.endsWith("\"")) { + AttributeString(value.substring(1, value.length - 1)) + } else { + // else, inspect the value to find an appropriate datatype + toAttribute(value) + } + + // the `toAttribute` method checks the first non-whitespace character of inbound TSV cells. + private val possibleNumbers = "0123456789+-." // possible first characters for a number + private val possibleBooleans = "tfTF" // possible first characters for a boolean + private val possibleReferences = "{" // possible first characters for an entity reference + private val possibleNonStrings = possibleNumbers + possibleBooleans + possibleReferences + + private def toAttribute(value: String): Attribute = { + val trimmed = value.trim + // empty string + if (trimmed.isEmpty) { + return AttributeString(value) + } + // find the first character of the inbound cell + val firstChar = trimmed.charAt(0) + + // first char doesn't match any known starters; this is a string + if (!possibleNonStrings.contains(firstChar)) { + return AttributeString(value) + } + + // could this be a number? + if (possibleNumbers.contains(firstChar)) { + Try(java.lang.Integer.parseInt(value)) match { + case Success(intValue) => return AttributeNumber(intValue) + case Failure(_) => + Try(java.lang.Double.parseDouble(value)) match { + // because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN, + // if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through + // to AttributeString. + case Success(doubleValue) + if !Double.NegativeInfinity.equals(doubleValue) + && !Double.PositiveInfinity.equals(doubleValue) + && !Double.NaN.equals(doubleValue) + && !matchesLiteral(value) => + return AttributeNumber(doubleValue) + case _ => // noop + } + } + } + + // could this be a boolean? + if (possibleBooleans.contains(firstChar)) { + Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")) match { + case Success(booleanValue) => return AttributeBoolean(booleanValue) + case _ => // noop + } } + // could this be a reference? + if (possibleReferences.contains(firstChar)) { + Try(value.parseJson.convertTo[AttributeEntityReference]) match { + case Success(ref) => return ref + case _ => // noop + } + } + + // it's a string. + AttributeString(value) + } + def checkForJson(value: String): Attribute = Try(value.parseJson) match { case Success(_: JsObject) => AttributeValueRawJson(value) From 6ad61e4898a3b1cbc3cc91d210ce029b47196b05 Mon Sep 17 00:00:00 2001 From: David An Date: Wed, 18 Dec 2024 14:55:05 -0500 Subject: [PATCH 3/3] more idiomatic Scala --- .../firecloud/service/TSVFileSupport.scala | 82 ++++++++----------- .../service/TSVFileSupportSpec.scala | 2 +- 2 files changed, 37 insertions(+), 47 deletions(-) diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupport.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupport.scala index 281ebd8a5..2528ef958 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupport.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupport.scala @@ -179,8 +179,8 @@ trait TSVFileSupport { Creates an AttributeValue whose implementation is more closely tied to the value of the input. */ def stringToTypedAttribute(value: String): Attribute = - // if this value starts and ends with a quote, it should always be treated as a string - if (value.startsWith("\"") && value.endsWith("\"")) { + if (value.length > 1 && value.startsWith("\"") && value.endsWith("\"")) { + // if this value starts and ends with a quote, it should always be treated as a string AttributeString(value.substring(1, value.length - 1)) } else { // else, inspect the value to find an appropriate datatype @@ -197,54 +197,44 @@ trait TSVFileSupport { val trimmed = value.trim // empty string if (trimmed.isEmpty) { - return AttributeString(value) - } - // find the first character of the inbound cell - val firstChar = trimmed.charAt(0) - - // first char doesn't match any known starters; this is a string - if (!possibleNonStrings.contains(firstChar)) { - return AttributeString(value) - } - - // could this be a number? - if (possibleNumbers.contains(firstChar)) { - Try(java.lang.Integer.parseInt(value)) match { - case Success(intValue) => return AttributeNumber(intValue) - case Failure(_) => - Try(java.lang.Double.parseDouble(value)) match { - // because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN, - // if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through - // to AttributeString. - case Success(doubleValue) - if !Double.NegativeInfinity.equals(doubleValue) - && !Double.PositiveInfinity.equals(doubleValue) - && !Double.NaN.equals(doubleValue) - && !matchesLiteral(value) => - return AttributeNumber(doubleValue) - case _ => // noop + AttributeString(value) + } else { + // find the first character of the inbound cell + val firstChar = trimmed.charAt(0) + + // using the first character, try to convert the cell to a number, boolean, or reference + val maybeNonString: Option[Attribute] = firstChar match { + // first char doesn't match any known starters; this is a string + case _ if !possibleNonStrings.contains(firstChar) => Some(AttributeString(value)) + // could this be a number? + case _ if possibleNumbers.contains(firstChar) => + Try(java.lang.Integer.parseInt(value)) match { + case Success(intValue) => Some(AttributeNumber(intValue)) + case Failure(_) => + Try(java.lang.Double.parseDouble(value)) match { + // because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN, + // if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through + // to AttributeString. + case Success(doubleValue) + if !Double.NegativeInfinity.equals(doubleValue) + && !Double.PositiveInfinity.equals(doubleValue) + && !Double.NaN.equals(doubleValue) + && !matchesLiteral(value) => + Some(AttributeNumber(doubleValue)) + case _ => None + } } + // could this be a boolean? + case _ if possibleBooleans.contains(firstChar) => + Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")).toOption.map(AttributeBoolean) + // could this be a boolean? + case _ if possibleReferences.contains(firstChar) => + Try(value.parseJson.convertTo[AttributeEntityReference]).toOption } - } - - // could this be a boolean? - if (possibleBooleans.contains(firstChar)) { - Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")) match { - case Success(booleanValue) => return AttributeBoolean(booleanValue) - case _ => // noop - } - } - // could this be a reference? - if (possibleReferences.contains(firstChar)) { - Try(value.parseJson.convertTo[AttributeEntityReference]) match { - case Success(ref) => return ref - case _ => // noop - } + // if we didn't find one of the other attributes, it's a string. + maybeNonString.getOrElse(AttributeString(value)) } - - // it's a string. - AttributeString(value) } def checkForJson(value: String): Attribute = diff --git a/src/test/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupportSpec.scala b/src/test/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupportSpec.scala index 57c0bd160..48c735b6c 100644 --- a/src/test/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupportSpec.scala +++ b/src/test/scala/org/broadinstitute/dsde/firecloud/service/TSVFileSupportSpec.scala @@ -116,7 +116,7 @@ class TSVFileSupportSpec extends AnyFreeSpec with TSVFileSupport { Double.MinPositiveValue.toString -> AttributeNumber(Double.MinPositiveValue), Double.MaxValue.toString -> AttributeNumber(Double.MaxValue) ) - val stringTestCases = List("", "string", "true525600", ",") + val stringTestCases = List("", "string", "true525600", ",", "\"") val referenceTestCases = Map( """{"entityType":"targetType","entityName":"targetName"}""" -> AttributeEntityReference("targetType", "targetName"