Remove more top level cudf imports in core (#18862)

mroeschke · web-flow · commit 92b10138c552 · 2025-05-21T03:25:00.000Z
Towards #10820

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #18862
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
@@ -596,6 +596,7 @@ def on_missing_reference(app, env, node, contnode):
     ("py:obj", "DataFrame.pipe"),
     ("py:meth", "pyarrow.Table.to_pandas"),
     ("py:class", "abc.Hashable"),
+    ("py:class", "cp.ndarray"),
     ("py:class", "pd.DataFrame"),
     ("py:class", "pandas.core.indexes.frozen.FrozenList"),
     ("py:class", "pa.Array"),
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
@@ -2,16 +2,20 @@
 from __future__ import annotations
 
 import warnings
+from typing import TYPE_CHECKING
 
 import cupy as cp
 import pyarrow as pa
 
 import cudf
 from cudf.core.column import as_column
-from cudf.core.index import Index
+from cudf.core.dtypes import CategoricalDtype
 from cudf.options import get_option
 from cudf.utils.dtypes import can_convert_to_column, cudf_dtype_to_pa_type
 
+if TYPE_CHECKING:
+    from cudf.core.index import Index
+
 
 def factorize(
     values,
@@ -80,7 +84,6 @@ def factorize(
     >>> uniques
     Index([<NA>, 1.0, 2.0], dtype='float64')
     """
-
     return_cupy_array = isinstance(values, cp.ndarray)
 
     if not can_convert_to_column(values):
@@ -112,8 +115,10 @@ def factorize(
         dtype="int64" if get_option("mode.pandas_compatible") else None,
     ).values
 
-    return labels, cats.values if return_cupy_array else Index._from_column(
-        cats
+    # TODO: Avoid accessing Index from the top level namespace
+    return (
+        labels,
+        cats.values if return_cupy_array else cudf.Index._from_column(cats),
     )
 
 
@@ -218,6 +223,7 @@ def unique(values):
     >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values)
     array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
     """
+    # TODO: Avoid accessing Index and Series from the top level namespace
     if not isinstance(values, (cudf.Series, cudf.Index, cp.ndarray)):
         raise ValueError(
             "Must pass cudf.Series, cudf.Index, or cupy.ndarray object"
@@ -229,7 +235,7 @@ def unique(values):
         return cp.asarray(cudf.Index(values).unique())
     if isinstance(values, cudf.Series):
         if get_option("mode.pandas_compatible"):
-            if isinstance(values.dtype, cudf.CategoricalDtype):
+            if isinstance(values.dtype, CategoricalDtype):
                 raise NotImplementedError(
                     "cudf.Categorical is not implemented"
                 )
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
@@ -31,6 +31,10 @@
     from collections.abc import Hashable
 
     from cudf._typing import DtypeObj
+    from cudf.core.dataframe import DataFrame
+    from cudf.core.index import Index
+    from cudf.core.multiindex import MultiIndex
+    from cudf.core.series import Series
 
 _AXIS_MAP = {0: 0, 1: 1, "index": 0, "columns": 1}
 
@@ -114,7 +118,7 @@ def _get_combined_index(indexes, intersect: bool = False, sort=None):
 
 
 def _normalize_series_and_dataframe(
-    objs: list[cudf.Series | cudf.DataFrame], axis: Literal[0, 1]
+    objs: list[Series | DataFrame], axis: Literal[0, 1]
 ) -> None:
     """Convert any cudf.Series objects in objs to DataFrames in place."""
     # Default to naming series by a numerical id if they are not named.
@@ -545,14 +549,14 @@ def concat(
 
 
 def melt(
-    frame: cudf.DataFrame,
+    frame: DataFrame,
     id_vars=None,
     value_vars=None,
     var_name=None,
     value_name: Hashable = "value",
     col_level=None,
     ignore_index: bool = True,
-) -> cudf.DataFrame:
+) -> DataFrame:
     """Unpivots a DataFrame from wide format to long format,
     optionally leaving identifier variables set.
 
@@ -933,7 +937,9 @@ def _merge_sorted(
     if len(objs) < 1:
         raise ValueError("objs must be non-empty")
 
-    if not all(isinstance(table, cudf.core.frame.Frame) for table in objs):
+    if not all(
+        isinstance(table, (cudf.DataFrame, cudf.Series)) for table in objs
+    ):
         raise TypeError("Elements of objs must be Frame-like")
 
     if len(objs) == 1:
@@ -1003,9 +1009,9 @@ def _merge_sorted(
 
 def _pivot(
     col_accessor: ColumnAccessor,
-    index: cudf.Index | cudf.MultiIndex,
-    columns: cudf.Index | cudf.MultiIndex,
-) -> cudf.DataFrame:
+    index: Index | MultiIndex,
+    columns: Index | MultiIndex,
+) -> DataFrame:
     """
     Reorganize the values of the DataFrame according to the given
     index and columns.
@@ -1059,8 +1065,8 @@ def as_tuple(x):
 
 
 def pivot(
-    data: cudf.DataFrame, columns=None, index=no_default, values=no_default
-) -> cudf.DataFrame:
+    data: DataFrame, columns=None, index=no_default, values=no_default
+) -> DataFrame:
     """
     Return reshaped DataFrame organized by the given index and column values.
 
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
@@ -30,6 +30,8 @@
     import pyarrow as pa
 
     from cudf._typing import Dtype, NotImplementedType, ScalarLike
+    from cudf.core.dataframe import DataFrame
+    from cudf.core.index import Index
 
 
 class SingleColumnFrame(Frame, NotIterable):
@@ -217,15 +219,15 @@ def tolist(self) -> None:
 
     to_list = tolist
 
-    def _to_frame(
-        self, name: Hashable, index: cudf.Index | None
-    ) -> cudf.DataFrame:
+    def _to_frame(self, name: Hashable, index: Index | None) -> DataFrame:
         """Helper function for Series.to_frame, Index.to_frame"""
+
         if name is no_default:
             col_name = 0 if self.name is None else self.name
         else:
             col_name = name
         ca = ColumnAccessor({col_name: self._column}, verify=False)
+        # TODO: Avoid accessing DataFrame from the top level namespace
         return cudf.DataFrame._from_data(ca, index=index)
 
     @property  # type: ignore
@@ -279,7 +281,7 @@ def __cuda_array_interface__(self):
     @_performance_tracking
     def factorize(
         self, sort: bool = False, use_na_sentinel: bool = True
-    ) -> tuple[cupy.ndarray, cudf.Index]:
+    ) -> tuple[cupy.ndarray, Index]:
         """Encode the input values as integer labels.
 
         Parameters
@@ -309,7 +311,8 @@ def factorize(
         >>> uniques
         Index(['a', 'c'], dtype='object')
         """
-        return cudf.core.algorithms.factorize(
+        # TODO: Avoid accessing factorize from the top level namespace
+        return cudf.factorize(
             self,
             sort=sort,
             use_na_sentinel=use_na_sentinel,
@@ -344,6 +347,7 @@ def _make_operands_for_binop(
         Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]]
             The operands to be passed to _colwise_binop.
         """
+
         # Get the appropriate name for output operations involving two objects
         # that are Series-like objects. The output shares the lhs's name unless
         # the rhs is a _differently_ named Series-like object.
@@ -360,6 +364,7 @@ def _make_operands_for_binop(
             if not hasattr(
                 other, "__cuda_array_interface__"
             ) and not isinstance(other, cudf.RangeIndex):
+                # TODO: Avoid accessing RangeIndex from the top level namespace
                 return NotImplemented
 
             # Non-scalar right operands are valid iff they convert to columns.
diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py
@@ -4,7 +4,7 @@
 
 import pylibcudf as plc
 
-import cudf
+from cudf.core.series import Series
 
 
 class TokenizeVocabulary:
@@ -17,14 +17,14 @@ class TokenizeVocabulary:
         Strings column of vocabulary terms
     """
 
-    def __init__(self, vocabulary: cudf.Series) -> None:
+    def __init__(self, vocabulary: Series) -> None:
         self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary(
             vocabulary._column.to_pylibcudf(mode="read")
         )
 
     def tokenize(
         self, text, delimiter: str = "", default_id: int = -1
-    ) -> cudf.Series:
+    ) -> Series:
         """
         Parameters
         ----------
@@ -46,4 +46,4 @@ def tokenize(
             self.vocabulary, delimiter, default_id
         )
 
-        return cudf.Series._from_column(result)
+        return Series._from_column(result)
diff --git a/python/cudf/cudf/core/wordpiece_tokenize.py b/python/cudf/cudf/core/wordpiece_tokenize.py
@@ -4,7 +4,7 @@
 
 import pylibcudf as plc
 
-import cudf
+from cudf.core.series import Series
 
 
 class WordPieceVocabulary:
@@ -17,12 +17,12 @@ class WordPieceVocabulary:
         Strings column of vocabulary terms
     """
 
-    def __init__(self, vocabulary: cudf.Series) -> None:
+    def __init__(self, vocabulary: Series) -> None:
         self.vocabulary = plc.nvtext.wordpiece_tokenize.WordPieceVocabulary(
             vocabulary._column.to_pylibcudf(mode="read")
         )
 
-    def tokenize(self, text, max_words_per_row: int = 0) -> cudf.Series:
+    def tokenize(self, text, max_words_per_row: int = 0) -> Series:
         """
         Parameters
         ----------
@@ -43,4 +43,4 @@ def tokenize(self, text, max_words_per_row: int = 0) -> cudf.Series:
             self.vocabulary, max_words_per_row
         )
 
-        return cudf.Series._from_column(result)
+        return Series._from_column(result)