[SPARK-52701][PS] Fix float32 type widening in mod with bool under ANSI

xinrong-meng · asl3 · commit fdee7cb022e5 · 2025-07-14T09:58:46.000-07:00
### What changes were proposed in this pull request? Fix float32 type widening in `mod` with bool under ANSI. ### Why are the changes needed? Ensure pandas on Spark works well with ANSI mode on. Part of https://issues.apache.org/jira/browse/SPARK-52700. ### Does this PR introduce _any_ user-facing change? Yes. `mod` under ANSI works as pandas. ```py (dev3.11) spark (mod_dtype) % SPARK_ANSI_SQL_MODE=False ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_num_mod NumModTests.test_mod" ... Tests passed in 8 seconds (dev3.11) spark (mod_dtype) % SPARK_ANSI_SQL_MODE=True ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_num_mod NumModTests.test_mod" ... Tests passed in 7 seconds ``` ### How was this patch tested? Unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes apache#51394 from xinrong-meng/mod_dtype. Authored-by: Xinrong Meng <xinrong@apache.org> Signed-off-by: Xinrong Meng <xinrong@apache.org>
diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -23,6 +23,7 @@
 from pandas.api.types import (  # type: ignore[attr-defined]
     is_bool_dtype,
     is_integer_dtype,
+    is_float_dtype,
     CategoricalDtype,
     is_list_like,
 )
@@ -42,7 +43,7 @@
     _is_valid_for_logical_operator,
     _is_boolean_type,
 )
-from pyspark.pandas.typedef.typehints import extension_dtypes, pandas_on_spark_type
+from pyspark.pandas.typedef.typehints import extension_dtypes, pandas_on_spark_type, as_spark_type
 from pyspark.pandas.utils import is_ansi_mode_enabled
 from pyspark.sql import functions as F, Column as PySparkColumn
 from pyspark.sql.types import (
@@ -69,6 +70,26 @@ def _non_fractional_astype(
         return _as_other_type(index_ops, dtype, spark_type)
 
 
+def _cast_back_float(
+    expr: PySparkColumn, left_dtype: Union[str, type, Dtype], right: Any
+) -> PySparkColumn:
+    """
+    Cast the result expression back to the original float dtype if needed.
+
+    This function ensures pandas on Spark matches pandas behavior when performing
+    arithmetic operations involving float and boolean values. In such cases, under ANSI mode,
+    Spark implicitly widen float32 to float64, which deviates from pandas behavior where the
+    result retains float32.
+    """
+    is_left_float = is_float_dtype(left_dtype)
+    is_right_bool = isinstance(right, bool) or (
+        hasattr(right, "dtype") and is_bool_dtype(right.dtype)
+    )
+    if is_left_float and is_right_bool:
+        return expr.cast(as_spark_type(left_dtype))
+    return expr
+
+
 class NumericOps(DataTypeOps):
     """The class for binary operations of numeric pandas-on-Spark objects."""
 
@@ -98,16 +119,19 @@ def mod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
             raise TypeError("Modulo can not be applied to given types.")
         spark_session = left._internal.spark_frame.sparkSession
 
-        def mod(left: PySparkColumn, right: Any) -> PySparkColumn:
+        def mod(left_op: PySparkColumn, right_op: Any) -> PySparkColumn:
             if is_ansi_mode_enabled(spark_session):
-                return F.when(F.lit(right == 0), F.lit(None)).otherwise(
-                    ((left % right) + right) % right
+                expr = F.when(F.lit(right_op == 0), F.lit(None)).otherwise(
+                    ((left_op % right_op) + right_op) % right_op
                 )
+                expr = _cast_back_float(expr, left.dtype, right)
             else:
-                return ((left % right) + right) % right
+                expr = ((left_op % right_op) + right_op) % right_op
+            return expr
 
-        right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
-        return column_op(mod)(left, right)
+        new_right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
+
+        return column_op(mod)(left, new_right)
 
     def pow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
         _sanitize_list_like(right)
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py b/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
@@ -35,7 +35,6 @@ def float_pser(self):
     def float_psser(self):
         return ps.from_pandas(self.float_pser)
 
-    @unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
     def test_mod(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols: