
[SPARK-52904][PYTHON] Enable convertToArrowArraySafely by default #51596

Status: Closed · 7 commits
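For context: `spark.sql.execution.pandas.convertToArrowArraySafely` makes the pandas-to-Arrow conversion use Arrow's safe cast, so lossy conversions (integer overflow, truncated nanosecond timestamps) raise an error instead of silently corrupting data. A minimal sketch of the behavior this PR changes, assuming a local SparkSession with Arrow enabled:

```python
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# The new default after this PR; previously this had to be enabled manually.
spark.conf.set("spark.sql.execution.pandas.convertToArrowArraySafely", "true")

pdf = pd.DataFrame({"ts": [pd.Timestamp(1)]})  # 1 ns since the epoch, not representable in µs
# spark.createDataFrame(pdf)  # would now raise an Arrow precision-loss error instead of truncating
```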
13 changes: 8 additions & 5 deletions python/pyspark/pandas/tests/computation/test_describe.py
@@ -135,7 +135,7 @@ def test_describe(self):
         psdf = ps.DataFrame(
             {
                 "a": [1, 2, 3],
-                "b": [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(1)],
Review comment (Contributor): I feel we can skip the changes under python/pyspark/pandas for now and instead explicitly set the config to false in setUpClass.

Also cc @xinrong-meng on the behavior change in pandas-on-Spark.
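A sketch of that suggestion, assuming the suite's base class exposes the usual `cls.spark` session; the exact hook may differ per test utility:

```python
@classmethod
def setUpClass(cls):
    super().setUpClass()
    # Keep the legacy unsafe conversion for pandas-on-Spark tests only
    cls.spark.conf.set("spark.sql.execution.pandas.convertToArrowArraySafely", "false")
```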

Review comment (Author): Alright, I've disabled the conf for the tests in python/pyspark/pandas.

Review comment (Author): Let me know if you'd also want to avoid changes to the other tests.

"b": [pd.Timestamp(1000), pd.Timestamp(1000), pd.Timestamp(1000)],
"c": [None, None, None],
}
)
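Why `pd.Timestamp(1)` had to change: a bare integer is interpreted as nanoseconds since the epoch, while Spark timestamps carry microsecond precision, so 1 ns cannot be converted without loss; 1000 ns is exactly 1 µs and round-trips. For illustration:

```python
import pandas as pd

pd.Timestamp(1)     # 1970-01-01 00:00:00.000000001 — sub-microsecond, lossy under safe conversion
pd.Timestamp(1000)  # 1970-01-01 00:00:00.000001    — exactly 1 µs, converts cleanly
```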
@@ -184,8 +184,8 @@ def test_describe_empty(self):
         # Explicit empty DataFrame timestamp only
         psdf = ps.DataFrame(
             {
-                "a": [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(1)],
-                "b": [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(1)],
+                "a": [pd.Timestamp(1000), pd.Timestamp(1000), pd.Timestamp(1000)],
+                "b": [pd.Timestamp(1000), pd.Timestamp(1000), pd.Timestamp(1000)],
             }
         )
         pdf = psdf._to_pandas()
@@ -199,7 +199,7 @@ def test_describe_empty(self):

         # Explicit empty DataFrame numeric & timestamp
         psdf = ps.DataFrame(
-            {"a": [1, 2, 3], "b": [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(1)]}
+            {"a": [1, 2, 3], "b": [pd.Timestamp(1000), pd.Timestamp(1000), pd.Timestamp(1000)]}
         )
         pdf = psdf._to_pandas()
         pdf_result = pdf[pdf.a != pdf.a].describe()
@@ -219,7 +219,7 @@ def test_describe_empty(self):

         # Explicit empty DataFrame string & timestamp
         psdf = ps.DataFrame(
-            {"a": ["a", "b", "c"], "b": [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(1)]}
+            {
+                "a": ["a", "b", "c"],
+                "b": [pd.Timestamp(1000), pd.Timestamp(1000), pd.Timestamp(1000)],
+            }
         )
         pdf = psdf._to_pandas()
         pdf_result = pdf[pdf.a != pdf.a].describe()
4 changes: 2 additions & 2 deletions python/pyspark/pandas/tests/data_type_ops/testing_utils.py
@@ -80,7 +80,7 @@ def non_numeric_pdf(self):
             "date": pd.Series(
                 [datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3)]
             ),
-            "datetime": pd.to_datetime(pd.Series([1, 2, 3])),
+            "datetime": pd.to_datetime(pd.Series([1, 2, 3]), unit="s"),
             "timedelta": pd.Series(
                 [datetime.timedelta(1), datetime.timedelta(hours=2), datetime.timedelta(weeks=3)]
             ),
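The same reasoning applies to `pd.to_datetime`: without a `unit` argument the integers are read as nanoseconds; `unit="s"` yields whole seconds, which are exactly representable at microsecond precision:

```python
import pandas as pd

pd.to_datetime(pd.Series([1, 2, 3]))            # 1, 2, 3 ns — lossy at µs precision
pd.to_datetime(pd.Series([1, 2, 3]), unit="s")  # 1, 2, 3 s — safe to convert
```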
@@ -127,7 +127,7 @@ def numeric_pser_psser_pairs(self):
     def non_numeric_psers(self):
         psers = {
             "string": pd.Series(["x", "y", "z"]),
-            "datetime": pd.to_datetime(pd.Series([1, 2, 3])),
+            "datetime": pd.to_datetime(pd.Series([1, 2, 3]), unit="s"),
             "bool": pd.Series([True, True, False]),
             "date": pd.Series(
                 [datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3)]
82 changes: 43 additions & 39 deletions python/pyspark/pandas/tests/test_numpy_compat.py
@@ -85,52 +85,56 @@ def test_np_unsupported_frame(self):
     def test_np_spark_compat_series(self):
         from pyspark.pandas.numpy_compat import unary_np_spark_mappings, binary_np_spark_mappings

-        # Use randomly generated dataFrame
-        pdf = pd.DataFrame(
-            np.random.randint(-100, 100, size=(np.random.randint(100), 2)), columns=["a", "b"]
-        )
-        pdf2 = pd.DataFrame(
-            np.random.randint(-100, 100, size=(len(pdf), len(pdf.columns))), columns=["a", "b"]
-        )
-        psdf = ps.from_pandas(pdf)
-        psdf2 = ps.from_pandas(pdf2)
-
-        for np_name, spark_func in unary_np_spark_mappings.items():
-            np_func = getattr(np, np_name)
-            if np_name not in self.blacklist:
-                try:
-                    # unary ufunc
-                    self.assert_eq(np_func(pdf.a), np_func(psdf.a), almost=True)
-                except Exception as e:
-                    raise AssertionError("Test in '%s' function was failed." % np_name) from e
-
-        for np_name, spark_func in binary_np_spark_mappings.items():
-            np_func = getattr(np, np_name)
-            if np_name not in self.blacklist:
-                try:
-                    # binary ufunc
-                    self.assert_eq(np_func(pdf.a, pdf.b), np_func(psdf.a, psdf.b), almost=True)
-                    self.assert_eq(np_func(pdf.a, 1), np_func(psdf.a, 1), almost=True)
-                except Exception as e:
-                    raise AssertionError("Test in '%s' function was failed." % np_name) from e
-
-        # Test only top 5 for now. 'compute.ops_on_diff_frames' option increases too much time.
-        try:
-            set_option("compute.ops_on_diff_frames", True)
-            for np_name, spark_func in list(binary_np_spark_mappings.items())[:5]:
-                np_func = getattr(np, np_name)
-                if np_name not in self.blacklist:
-                    try:
-                        # binary ufunc
-                        self.assert_eq(
-                            np_func(pdf.a, pdf2.b).sort_index(),
-                            np_func(psdf.a, psdf2.b).sort_index(),
-                            almost=True,
-                        )
-                    except Exception as e:
-                        raise AssertionError("Test in '%s' function was failed." % np_name) from e
-        finally:
-            reset_option("compute.ops_on_diff_frames")
+        # Disable arrow errors, some numpy functions produce results that exceed value ranges
+        with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}):
+            # Use randomly generated dataFrame
+            pdf = pd.DataFrame(
+                np.random.randint(-100, 100, size=(np.random.randint(100), 2)), columns=["a", "b"]
+            )
+            pdf2 = pd.DataFrame(
+                np.random.randint(-100, 100, size=(len(pdf), len(pdf.columns))), columns=["a", "b"]
+            )
+            psdf = ps.from_pandas(pdf)
+            psdf2 = ps.from_pandas(pdf2)
+
+            for np_name, spark_func in unary_np_spark_mappings.items():
+                np_func = getattr(np, np_name)
+                if np_name not in self.blacklist:
+                    try:
+                        # unary ufunc
+                        self.assert_eq(np_func(pdf.a), np_func(psdf.a), almost=True)
+                    except Exception as e:
+                        raise AssertionError("Test in '%s' function was failed." % np_name) from e
+
+            for np_name, spark_func in binary_np_spark_mappings.items():
+                np_func = getattr(np, np_name)
+                if np_name not in self.blacklist:
+                    try:
+                        # binary ufunc
+                        self.assert_eq(np_func(pdf.a, pdf.b), np_func(psdf.a, psdf.b), almost=True)
+                        self.assert_eq(np_func(pdf.a, 1), np_func(psdf.a, 1), almost=True)
+                    except Exception as e:
+                        raise AssertionError("Test in '%s' function was failed." % np_name) from e
+
+            # Test only top 5 for now. 'compute.ops_on_diff_frames' option increases too much time.
+            try:
+                set_option("compute.ops_on_diff_frames", True)
+                for np_name, spark_func in list(binary_np_spark_mappings.items())[:5]:
+                    np_func = getattr(np, np_name)
+                    if np_name not in self.blacklist:
+                        try:
+                            # binary ufunc
+                            self.assert_eq(
+                                np_func(pdf.a, pdf2.b).sort_index(),
+                                np_func(psdf.a, psdf2.b).sort_index(),
+                                almost=True,
+                            )
+                        except Exception as e:
+                            raise AssertionError(
+                                "Test in '%s' function was failed." % np_name
+                            ) from e
+            finally:
+                reset_option("compute.ops_on_diff_frames")

     @unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
     def test_np_spark_compat_frame(self):
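To illustrate the value-range problem the new comment refers to, a hedged sketch using pyarrow directly (the test itself goes through Spark's serializer, not this API):

```python
import numpy as np
import pyarrow as pa

vals = np.exp(np.array([50.0]))   # ~5.18e21, far beyond the int64 range
arr = pa.array(vals)
arr.cast(pa.int64(), safe=True)   # raises pyarrow.lib.ArrowInvalid
arr.cast(pa.int64(), safe=False)  # overflows silently
```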
4 changes: 2 additions & 2 deletions python/pyspark/sql/tests/arrow/test_arrow.py
@@ -707,8 +707,8 @@ def check_createDataFrame_with_single_data_type(self):
     def test_createDataFrame_does_not_modify_input(self):
         # Some series get converted for Spark to consume, this makes sure input is unchanged
         pdf = self.create_pandas_data_frame()
-        # Use a nanosecond value to make sure it is not truncated
-        pdf.iloc[0, 7] = pd.Timestamp(1)
+        # Use a nanosecond value that converts to microseconds without precision loss
+        pdf.iloc[0, 7] = pd.Timestamp(1000)
         # Integers with nulls will get NaNs filled with 0 and will be casted
         pdf.iloc[1, 1] = None
         pdf_copy = pdf.copy(deep=True)
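Under the new default the serializer effectively applies Arrow's safe cast, which is why a 1 ns timestamp would now fail here; a sketch of the equivalent pyarrow call (assumed to mirror what the conversion path does):

```python
import pandas as pd
import pyarrow as pa

s = pd.Series([pd.Timestamp(1)])  # timestamp[ns] with a sub-microsecond component
pa.Array.from_pandas(s, type=pa.timestamp("us"), safe=True)   # raises ArrowInvalid: would lose data
pa.Array.from_pandas(s, type=pa.timestamp("us"), safe=False)  # truncates silently
```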
5 changes: 4 additions & 1 deletion python/pyspark/sql/tests/connect/test_connect_creation.py
@@ -530,12 +530,15 @@ def test_create_dataframe_from_pandas_with_ns_timestamp(self):
         from pandas import Timestamp
         import pandas as pd

-        # Nanoseconds are truncated to microseconds in the serializer
+        # Arrow will throw an error if precision is lost
+        # (i.e., nanoseconds cannot be represented in microseconds)
         pdf = pd.DataFrame(
             {
                 "naive": [datetime(2019, 1, 1, 0)],
                 "aware": [
                     Timestamp(
-                        year=2019, month=1, day=1, nanosecond=500, tz=timezone(timedelta(hours=-8))
+                        year=2019, month=1, day=1, nanosecond=0, tz=timezone(timedelta(hours=-8))
                     )
                 ],
             }
7 changes: 5 additions & 2 deletions python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py
@@ -517,7 +517,10 @@ def test_vectorized_udf_struct_complex(self):

         def _scalar_f(id):
             return pd.DataFrame(
-                {"ts": id.apply(lambda i: pd.Timestamp(i)), "arr": id.apply(lambda i: [i, i + 1])}
+                {
+                    "ts": id.apply(lambda i: pd.Timestamp(i, unit="s")),
+                    "arr": id.apply(lambda i: [i, i + 1]),
+                }
             )

         scalar_f = pandas_udf(_scalar_f, returnType=return_type)
@@ -532,7 +535,7 @@ def iter_f(it):
         for i, row in enumerate(actual):
             id, f = row
             self.assertEqual(i, id)
-            self.assertEqual(pd.Timestamp(i).to_pydatetime(), f[0])
+            self.assertEqual(pd.Timestamp(i, unit="s").to_pydatetime(), f[0])
             self.assertListEqual([i, i + 1], f[1])

     def test_vectorized_udf_struct_empty(self):
@@ -3990,7 +3990,7 @@ object SQLConf {
"check and do type conversions anyway. This config only works for Arrow 0.11.0+.")
.version("3.0.0")
.booleanConf
.createWithDefault(false)
.createWithDefault(true)

val PYSPARK_WORKER_PYTHON_EXECUTABLE =
buildConf("spark.sql.execution.pyspark.python")