Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package org.broadinstitute.dsde.firecloud.utils

import org.broadinstitute.dsde.firecloud.model.{EntityUpdateDefinition, FlexibleModelSchema, ModelSchema}
import org.broadinstitute.dsde.firecloud.service.TSVFileSupport
import org.broadinstitute.dsde.firecloud.utils.TsvFileSupportBenchmark.TsvData
import org.openjdk.jmh.annotations.{Benchmark, Scope, State}
import org.openjdk.jmh.infra.Blackhole

object TsvFileSupportBenchmark {
@State(Scope.Thread)
class TsvData {
val entityType: String = "sample"
val memberTypeOpt: Option[String] = None
// representation of the inbound TSV row
val row: Seq[String] = Seq(
"0005",
"foo",
"\"some\tquoted\tvalue\"",
"42",
"true",
"-123.456",
"gs://some-bucket/somefile.ext",
"""{"entityType":"targetType", "entityName":"targetName"}""",
"[1,2,3,4,5]"
)
val colInfo: Seq[(String, Option[String])] = Seq(
("sample_id", None),
("string", None),
("quotedstring", None),
("int", None),
("boolean", None),
("double", None),
("file", None),
("reference", None),
("array", None)
)
val modelSchema: ModelSchema = FlexibleModelSchema
val deleteEmptyValues: Boolean = false
}
}

class TsvFileSupportBenchmark {

@Benchmark
def makeEntityRows(blackHole: Blackhole, tsvData: TsvData): EntityUpdateDefinition = {

val result: EntityUpdateDefinition = TsvFileSupportHarness.setAttributesOnEntity(tsvData.entityType,
tsvData.memberTypeOpt,
tsvData.row,
tsvData.colInfo,
tsvData.modelSchema,
tsvData.deleteEmptyValues
)
blackHole.consume(result)
result
}

}

// helper object to get access to TSVFileSupport trait
object TsvFileSupportHarness extends TSVFileSupport {}
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ import scala.concurrent.{ExecutionContext, Future}
import scala.util.{Failure, Success, Try}

/**
* The different types of tsv import/export formats
*/
* The different types of tsv import/export formats
*/
object TsvTypes {
sealed trait TsvType
case object ENTITY extends TsvType {
Expand Down Expand Up @@ -179,31 +179,74 @@ trait TSVFileSupport {
Creates an AttributeValue whose implementation is more closely tied to the value of the input.
*/
def stringToTypedAttribute(value: String): Attribute =
Try(java.lang.Integer.parseInt(value)) match {
case Success(intValue) => AttributeNumber(intValue)
case Failure(_) =>
Try(java.lang.Double.parseDouble(value)) match {
// because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN,
// if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through
// to AttributeString.
case Success(doubleValue)
if !Double.NegativeInfinity.equals(doubleValue)
&& !Double.PositiveInfinity.equals(doubleValue)
&& !Double.NaN.equals(doubleValue)
&& !matchesLiteral(value) =>
AttributeNumber(doubleValue)
case _ =>
Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")) match {
case Success(booleanValue) => AttributeBoolean(booleanValue)
case Failure(_) =>
Try(value.parseJson.convertTo[AttributeEntityReference]) match {
case Success(ref) => ref
case Failure(_) => AttributeString(value)
}
}
}
// if this value starts and ends with a quote, it should always be treated as a string
if (value.startsWith("\"") && value.endsWith("\"")) {
AttributeString(value.substring(1, value.length - 1))
} else {
// else, inspect the value to find an appropriate datatype
toAttribute(value)
}

// the `toAttribute` method checks the first non-whitespace character of inbound TSV cells.
private val possibleNumbers = "0123456789+-." // possible first characters for a number
private val possibleBooleans = "tfTF" // possible first characters for a boolean
private val possibleReferences = "{" // possible first characters for an entity reference
private val possibleNonStrings = possibleNumbers + possibleBooleans + possibleReferences

private def toAttribute(value: String): Attribute = {
val trimmed = value.trim
// empty string
if (trimmed.isEmpty) {
return AttributeString(value)
}
// find the first character of the inbound cell
val firstChar = trimmed.charAt(0)

// first char doesn't match any known starters; this is a string
if (!possibleNonStrings.contains(firstChar)) {
return AttributeString(value)
}

// could this be a number?
if (possibleNumbers.contains(firstChar)) {
Try(java.lang.Integer.parseInt(value)) match {
case Success(intValue) => return AttributeNumber(intValue)
case Failure(_) =>
Try(java.lang.Double.parseDouble(value)) match {
// because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN,
// if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through
// to AttributeString.
case Success(doubleValue)
if !Double.NegativeInfinity.equals(doubleValue)
&& !Double.PositiveInfinity.equals(doubleValue)
&& !Double.NaN.equals(doubleValue)
&& !matchesLiteral(value) =>
return AttributeNumber(doubleValue)
case _ => // noop
}
}
}

// could this be a boolean?
if (possibleBooleans.contains(firstChar)) {
Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")) match {
case Success(booleanValue) => return AttributeBoolean(booleanValue)
case _ => // noop
}
}

// could this be a reference?
if (possibleReferences.contains(firstChar)) {
Try(value.parseJson.convertTo[AttributeEntityReference]) match {
case Success(ref) => return ref
case _ => // noop
}
}

// it's a string.
AttributeString(value)
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The early exits bother me in scala code so I asked copilot to rewrite using collectFirst and this is what it came up with.

Suggested change
private def toAttribute(value: String): Attribute = {
val trimmed = value.trim
// empty string
if (trimmed.isEmpty) {
return AttributeString(value)
}
// find the first character of the inbound cell
val firstChar = trimmed.charAt(0)
// first char doesn't match any known starters; this is a string
if (!possibleNonStrings.contains(firstChar)) {
return AttributeString(value)
}
// could this be a number?
if (possibleNumbers.contains(firstChar)) {
Try(java.lang.Integer.parseInt(value)) match {
case Success(intValue) => return AttributeNumber(intValue)
case Failure(_) =>
Try(java.lang.Double.parseDouble(value)) match {
// because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN,
// if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through
// to AttributeString.
case Success(doubleValue)
if !Double.NegativeInfinity.equals(doubleValue)
&& !Double.PositiveInfinity.equals(doubleValue)
&& !Double.NaN.equals(doubleValue)
&& !matchesLiteral(value) =>
return AttributeNumber(doubleValue)
case _ => // noop
}
}
}
// could this be a boolean?
if (possibleBooleans.contains(firstChar)) {
Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")) match {
case Success(booleanValue) => return AttributeBoolean(booleanValue)
case _ => // noop
}
}
// could this be a reference?
if (possibleReferences.contains(firstChar)) {
Try(value.parseJson.convertTo[AttributeEntityReference]) match {
case Success(ref) => return ref
case _ => // noop
}
}
// it's a string.
AttributeString(value)
}
private def toAttribute(value: String): Attribute = {
val trimmed = value.trim
if (trimmed.isEmpty) {
// empty string
AttributeString(value)
} else {
val firstChar = trimmed.charAt(0)
val possibleConversions = Seq(
() =>
// first char doesn't match any known starters; this is a string
if (!possibleNonStrings.contains(firstChar)) Some(AttributeString(value))
else None,
() =>
// could this be an int?
if (possibleNumbers.contains(firstChar)) Try(java.lang.Integer.parseInt(value)).toOption.map(AttributeNumber(_))
else None,
() =>
// could this be a double?
if (possibleNumbers.contains(firstChar))
Try(java.lang.Double.parseDouble(value)).toOption
// because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN,
// if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through
// to AttributeString.
.filterNot(doubleValue =>
Double.NegativeInfinity.equals(doubleValue) || Double.PositiveInfinity.equals(doubleValue) || Double.NaN
.equals(doubleValue) || matchesLiteral(value)
)
.map(AttributeNumber(_))
else None,
() =>
// could this be a boolean?
if (possibleBooleans.contains(firstChar))
Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")).toOption.map(AttributeBoolean)
else None,
() =>
// could this be a reference?
if (possibleReferences.contains(firstChar)) Try(value.parseJson.convertTo[AttributeEntityReference]).toOption
else None
)
possibleConversions
.collectFirst { case conversion if conversion().isDefined => conversion().get }
.getOrElse(AttributeString(value))
}
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dvoet I've rewritten to be more idiomatic - what do you think?


def checkForJson(value: String): Attribute =
Try(value.parseJson) match {
case Success(_: JsObject) => AttributeValueRawJson(value)
Expand Down
Loading