-
Notifications
You must be signed in to change notification settings - Fork 4
CORE-207: optimize stringToTypedAttribute in TSV upload #1519
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you can take or leave my comment
private def toAttribute(value: String): Attribute = { | ||
val trimmed = value.trim | ||
// empty string | ||
if (trimmed.isEmpty) { | ||
return AttributeString(value) | ||
} | ||
// find the first character of the inbound cell | ||
val firstChar = trimmed.charAt(0) | ||
|
||
// first char doesn't match any known starters; this is a string | ||
if (!possibleNonStrings.contains(firstChar)) { | ||
return AttributeString(value) | ||
} | ||
|
||
// could this be a number? | ||
if (possibleNumbers.contains(firstChar)) { | ||
Try(java.lang.Integer.parseInt(value)) match { | ||
case Success(intValue) => return AttributeNumber(intValue) | ||
case Failure(_) => | ||
Try(java.lang.Double.parseDouble(value)) match { | ||
// because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN, | ||
// if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through | ||
// to AttributeString. | ||
case Success(doubleValue) | ||
if !Double.NegativeInfinity.equals(doubleValue) | ||
&& !Double.PositiveInfinity.equals(doubleValue) | ||
&& !Double.NaN.equals(doubleValue) | ||
&& !matchesLiteral(value) => | ||
return AttributeNumber(doubleValue) | ||
case _ => // noop | ||
} | ||
} | ||
} | ||
|
||
// could this be a boolean? | ||
if (possibleBooleans.contains(firstChar)) { | ||
Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")) match { | ||
case Success(booleanValue) => return AttributeBoolean(booleanValue) | ||
case _ => // noop | ||
} | ||
} | ||
|
||
// could this be a reference? | ||
if (possibleReferences.contains(firstChar)) { | ||
Try(value.parseJson.convertTo[AttributeEntityReference]) match { | ||
case Success(ref) => return ref | ||
case _ => // noop | ||
} | ||
} | ||
|
||
// it's a string. | ||
AttributeString(value) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The early exits bother me in scala code so I asked copilot to rewrite using collectFirst
and this is what it came up with.
private def toAttribute(value: String): Attribute = { | |
val trimmed = value.trim | |
// empty string | |
if (trimmed.isEmpty) { | |
return AttributeString(value) | |
} | |
// find the first character of the inbound cell | |
val firstChar = trimmed.charAt(0) | |
// first char doesn't match any known starters; this is a string | |
if (!possibleNonStrings.contains(firstChar)) { | |
return AttributeString(value) | |
} | |
// could this be a number? | |
if (possibleNumbers.contains(firstChar)) { | |
Try(java.lang.Integer.parseInt(value)) match { | |
case Success(intValue) => return AttributeNumber(intValue) | |
case Failure(_) => | |
Try(java.lang.Double.parseDouble(value)) match { | |
// because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN, | |
// if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through | |
// to AttributeString. | |
case Success(doubleValue) | |
if !Double.NegativeInfinity.equals(doubleValue) | |
&& !Double.PositiveInfinity.equals(doubleValue) | |
&& !Double.NaN.equals(doubleValue) | |
&& !matchesLiteral(value) => | |
return AttributeNumber(doubleValue) | |
case _ => // noop | |
} | |
} | |
} | |
// could this be a boolean? | |
if (possibleBooleans.contains(firstChar)) { | |
Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")) match { | |
case Success(booleanValue) => return AttributeBoolean(booleanValue) | |
case _ => // noop | |
} | |
} | |
// could this be a reference? | |
if (possibleReferences.contains(firstChar)) { | |
Try(value.parseJson.convertTo[AttributeEntityReference]) match { | |
case Success(ref) => return ref | |
case _ => // noop | |
} | |
} | |
// it's a string. | |
AttributeString(value) | |
} | |
private def toAttribute(value: String): Attribute = { | |
val trimmed = value.trim | |
if (trimmed.isEmpty) { | |
// empty string | |
AttributeString(value) | |
} else { | |
val firstChar = trimmed.charAt(0) | |
val possibleConversions = Seq( | |
() => | |
// first char doesn't match any known starters; this is a string | |
if (!possibleNonStrings.contains(firstChar)) Some(AttributeString(value)) | |
else None, | |
() => | |
// could this be an int? | |
if (possibleNumbers.contains(firstChar)) Try(java.lang.Integer.parseInt(value)).toOption.map(AttributeNumber(_)) | |
else None, | |
() => | |
// could this be a double? | |
if (possibleNumbers.contains(firstChar)) | |
Try(java.lang.Double.parseDouble(value)).toOption | |
// because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN, | |
// if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through | |
// to AttributeString. | |
.filterNot(doubleValue => | |
Double.NegativeInfinity.equals(doubleValue) || Double.PositiveInfinity.equals(doubleValue) || Double.NaN | |
.equals(doubleValue) || matchesLiteral(value) | |
) | |
.map(AttributeNumber(_)) | |
else None, | |
() => | |
// could this be a boolean? | |
if (possibleBooleans.contains(firstChar)) | |
Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")).toOption.map(AttributeBoolean) | |
else None, | |
() => | |
// could this be a reference? | |
if (possibleReferences.contains(firstChar)) Try(value.parseJson.convertTo[AttributeEntityReference]).toOption | |
else None | |
) | |
possibleConversions | |
.collectFirst { case conversion if conversion().isDefined => conversion().get } | |
.getOrElse(AttributeString(value)) | |
} | |
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@dvoet I've rewritten to be more idiomatic - what do you think?
This PR is an optimization found while researching CORE-207. This should be performance only; it has no functional impact to end users.
The
stringToTypedAttribute()
function is called for each cell in an uploaded TSV; this is a lot of invocations so any optimization here is useful.Raw benchmark numbers:
Have you read CONTRIBUTING.md lately? If not, do that first.
I, the developer opening this PR, do solemnly pinky swear that:
In all cases: