Skip to content

Commit b0c2ba3

Browse files
xinrong-meng authored and ueshin committed
[SPARK-52249][PS] Enable divide-by-zero for numeric truediv with ANSI enabled
### What changes were proposed in this pull request?

Enable divide-by-zero for truediv with ANSI enabled

### Why are the changes needed?

Part of https://issues.apache.org/jira/browse/SPARK-52169

### Does this PR introduce _any_ user-facing change?

Yes, divide-by-zero for truediv is enabled with ANSI enabled

```py
>>> spark.conf.get("spark.sql.ansi.enabled")
'true'
>>> pdf = pd.DataFrame({"a": [1.0, -1.0, 0.0, np.nan], "b": [0.0, 0.0, 0.0, 0.0]})
>>> psdf = ps.from_pandas(pdf)
```

FROM

```py
>>> psdf["a"] / psdf["b"]
...
pyspark.errors.exceptions.captured.ArithmeticException: [DIVIDE_BY_ZERO] Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. SQLSTATE: 22012
== DataFrame ==
"__div__" was called from
<stdin>:1
```

TO

```py
>>> psdf["a"] / psdf["b"]
0    inf
1   -inf
2    NaN
3    NaN
dtype: float64
```

### How was this patch tested?

Unit tests

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #50972 from xinrong-meng/divide_0.

Authored-by: Xinrong Meng <xinrong@apache.org>
Signed-off-by: Takuya Ueshin <ueshin@databricks.com>
1 parent 73f1dd2 commit b0c2ba3

File tree

3 files changed

+39
-14
lines changed

3 files changed

+39
-14
lines changed

python/pyspark/pandas/data_type_ops/num_ops.py

Lines changed: 32 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@
4343
_is_boolean_type,
4444
)
4545
from pyspark.pandas.typedef.typehints import extension_dtypes, pandas_on_spark_type
46+
from pyspark.pandas.utils import is_ansi_mode_enabled
4647
from pyspark.sql import functions as F, Column as PySparkColumn
4748
from pyspark.sql.types import (
4849
BooleanType,
@@ -247,14 +248,23 @@ def truediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
247248
_sanitize_list_like(right)
248249
if not is_valid_operand_for_numeric_arithmetic(right):
249250
raise TypeError("True division can not be applied to given types.")
251+
spark_session = left._internal.spark_frame.sparkSession
252+
right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
250253

251254
def truediv(left: PySparkColumn, right: Any) -> PySparkColumn:
252-
return F.when(
253-
F.lit(right != 0) | F.lit(right).isNull(),
254-
left.__div__(right),
255-
).otherwise(F.lit(np.inf).__div__(left))
255+
if is_ansi_mode_enabled(spark_session):
256+
return F.when(
257+
F.lit(right == 0),
258+
F.when(left < 0, F.lit(float("-inf")))
259+
.when(left > 0, F.lit(float("inf")))
260+
.otherwise(F.lit(np.nan)),
261+
).otherwise(left / right)
262+
else:
263+
return F.when(
264+
F.lit(right != 0) | F.lit(right).isNull(),
265+
left.__div__(right),
266+
).otherwise(F.lit(np.inf).__div__(left))
256267

257-
right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
258268
return numpy_column_op(truediv)(left, right)
259269

260270
def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
@@ -332,18 +342,27 @@ def truediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
332342
_sanitize_list_like(right)
333343
if not is_valid_operand_for_numeric_arithmetic(right):
334344
raise TypeError("True division can not be applied to given types.")
345+
spark_session = left._internal.spark_frame.sparkSession
346+
right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
335347

336348
def truediv(left: PySparkColumn, right: Any) -> PySparkColumn:
337-
return F.when(
338-
F.lit(right != 0) | F.lit(right).isNull(),
339-
left.__div__(right),
340-
).otherwise(
341-
F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
342-
F.lit(np.inf).__div__(left)
349+
if is_ansi_mode_enabled(spark_session):
350+
return F.when(
351+
F.lit(right == 0),
352+
F.when(left < 0, F.lit(float("-inf")))
353+
.when(left > 0, F.lit(float("inf")))
354+
.otherwise(F.lit(np.nan)),
355+
).otherwise(left / right)
356+
else:
357+
return F.when(
358+
F.lit(right != 0) | F.lit(right).isNull(),
359+
left.__div__(right),
360+
).otherwise(
361+
F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
362+
F.lit(np.inf).__div__(left)
363+
)
343364
)
344-
)
345365

346-
right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
347366
return numpy_column_op(truediv)(left, right)
348367

349368
def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:

python/pyspark/pandas/tests/computation/test_binary_ops.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,6 @@ def test_binary_operator_sub(self):
111111
psdf = ps.DataFrame({"a": ["x"], "b": ["y"]})
112112
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] - psdf["b"])
113113

114-
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
115114
def test_divide_by_zero_behavior(self):
116115
# float / float
117116
# np.float32

python/pyspark/pandas/utils.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1070,6 +1070,13 @@ def xor(df1: PySparkDataFrame, df2: PySparkDataFrame) -> PySparkDataFrame:
10701070
)
10711071

10721072

1073+
def is_ansi_mode_enabled(spark: SparkSession) -> bool:
1074+
return (
1075+
ps.get_option("compute.ansi_mode_support", spark_session=spark)
1076+
and spark.conf.get("spark.sql.ansi.enabled") == "true"
1077+
)
1078+
1079+
10731080
def _test() -> None:
10741081
import os
10751082
import doctest

0 commit comments

Comments
 (0)