-
Couldn't load subscription status.
- Fork 573
Open
Labels
bug: Something isn't working
Description
Describe the bug
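`hasNumberOfDistinctValues` fails with an `UNRESOLVED_COLUMN` error when the checked column is named `count`. This seems to be a regression caused by #586.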
To Reproduce
In a local environment, the bug can be easily reproduced with the following code:
import com.amazon.deequ.checks.{Check, CheckLevel}
import com.amazon.deequ.{VerificationResult, VerificationSuite}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
/**
 * Minimal reproducer for the `hasNumberOfDistinctValues` failure on a column named `count`.
 *
 * Builds a three-row DataFrame in a local Spark session, runs a single deequ check that
 * expects exactly three distinct values in the `count` column, and prints both the raw
 * verification result and its tabular form.
 */
object HistogramChecker {

  /**
   * Entry point: creates the session, runs the check, prints the outcome, stops the session.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder.master("local").appName("test").getOrCreate()

    // All three columns are non-nullable; "count" is the column the check targets.
    val columns = Seq(
      StructField("id", StringType, nullable = false),
      StructField("count", IntegerType, nullable = false),
      StructField("dt", StringType, nullable = false)
    )

    // Produces ("id_0", 1), ("id_1", 2), ("id_2", 3), all sharing the same "dt" value.
    val records = (0 until 3).map(i => Row(s"id_$i", i + 1, "20240118"))

    val input = session.createDataFrame(
      session.sparkContext.parallelize(records),
      StructType(columns)
    )

    // Single check: the "count" column must hold exactly three distinct values.
    val outcome = VerificationSuite()
      .onData(input)
      .addCheck(
        Check(CheckLevel.Error, "Distinct value checks")
          .hasNumberOfDistinctValues("count", distinctValues => distinctValues == 3)
      )
      .run()

    println(outcome)
    VerificationResult.checkResultsAsDataFrame(session, outcome).show(false)

    session.stop()
  }
}
The resulting error is:
VerificationResult(Error,Map(Check(Error,Distinct value checks,List(HistogramBinConstraint(Histogram(count,None,1000,None,false,Count)))) -> CheckResult(Check(Error,Distinct value checks,List(HistogramBinConstraint(Histogram(count,None,1000,None,false,Count)))),Error,List(ConstraintResult(HistogramBinConstraint(Histogram(count,None,1000,None,false,Count)),Failure,Some([UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `com_amazon_deequ_dq_metrics_count` cannot be resolved. Did you mean one of the following? [`com_amazon_deequ_dq_metrics_count`, `com_amazon_deequ_dq_metrics_count`].;
'Sort ['com_amazon_deequ_dq_metrics_count DESC NULLS LAST], true
+- Project [count#12 AS com_amazon_deequ_dq_metrics_count#19, count#16L AS com_amazon_deequ_dq_metrics_count#20L]
+- Aggregate [count#12], [count#12, count(1) AS count#16L]
+- Project [coalesce(count#9, cast(NullValue as string)) AS count#12]
+- Project [cast(count#4 as string) AS count#9]
+- LogicalRDD [id#3, count#4, dt#5], false
),Some(HistogramMetric(count,Failure(org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `com_amazon_deequ_dq_metrics_count` cannot be resolved. Did you mean one of the following? [`com_amazon_deequ_dq_metrics_count`, `com_amazon_deequ_dq_metrics_count`].;
'Sort ['com_amazon_deequ_dq_metrics_count DESC NULLS LAST], true
+- Project [count#12 AS com_amazon_deequ_dq_metrics_count#19, count#16L AS com_amazon_deequ_dq_metrics_count#20L]
+- Aggregate [count#12], [count#12, count(1) AS count#16L]
+- Project [coalesce(count#9, cast(NullValue as string)) AS count#12]
+- Project [cast(count#4 as string) AS count#9]
+- LogicalRDD [id#3, count#4, dt#5], false
))))))),Map(Histogram(count,None,1000,None,false,Count) -> HistogramMetric(count,Failure(org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `com_amazon_deequ_dq_metrics_count` cannot be resolved. Did you mean one of the following? [`com_amazon_deequ_dq_metrics_count`, `com_amazon_deequ_dq_metrics_count`].;
'Sort ['com_amazon_deequ_dq_metrics_count DESC NULLS LAST], true
+- Project [count#12 AS com_amazon_deequ_dq_metrics_count#19, count#16L AS com_amazon_deequ_dq_metrics_count#20L]
+- Aggregate [count#12], [count#12, count(1) AS count#16L]
+- Project [coalesce(count#9, cast(NullValue as string)) AS count#12]
+- Project [cast(count#4 as string) AS count#9]
+- LogicalRDD [id#3, count#4, dt#5], false
))))
The output of `checkResultsAsDataFrame` is shown below:
+---------------------+-----------+------------+-------------------------------------------------------------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|check |check_level|check_status|constraint |constraint_status|constraint_message |
+---------------------+-----------+------------+-------------------------------------------------------------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Distinct value checks|Error |Error |HistogramBinConstraint(Histogram(count,None,1000,None,false,Count))|Failure |[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `com_amazon_deequ_dq_metrics_count` cannot be resolved. Did you mean one of the following? [`com_amazon_deequ_dq_metrics_count`, `com_amazon_deequ_dq_metrics_count`].;\n'Sort ['com_amazon_deequ_dq_metrics_count DESC NULLS LAST], true\n+- Project [count#12 AS com_amazon_deequ_dq_metrics_count#19, count#16L AS com_amazon_deequ_dq_metrics_count#20L]\n +- Aggregate [count#12], [count#12, count(1) AS count#16L]\n +- Project [coalesce(count#9, cast(NullValue as string)) AS count#12]\n +- Project [cast(count#4 as string) AS count#9]\n +- LogicalRDD [id#3, count#4, dt#5], false\n|
+---------------------+-----------+------------+-------------------------------------------------------------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
Expected behavior
The `hasNumberOfDistinctValues` check should work for DataFrames containing columns with special names (e.g., `count`, `sum`).
Screenshots
If applicable, add screenshots to help explain your problem.
Additional context
Checked and confirmed that this only happens for 2.0.8 and above.
(I have tested with 2.0.8-spark-3.4)
huymq1710
Metadata
Metadata
Assignees
Labels
bug: Something isn't working