rapidsai
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
Lines changed: 2 additions & 2 deletions
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
Lines changed: 3 additions & 5 deletions
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
Lines changed: 2 additions & 2 deletions
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
Lines changed: 68 additions & 43 deletions
diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py
Lines changed: 9 additions & 1 deletion
diff --git a/python/cudf/cudf/core/udf/nrt_utils.py b/python/cudf/cudf/core/udf/nrt_utils.py
Lines changed: 45 additions & 0 deletions
@@ -82,7 +82,7 @@
 from cudf.core.multiindex import MultiIndex
 from cudf.core.resample import DataFrameResampler
 from cudf.core.series import Series
-from cudf.core.udf.row_function import _get_row_kernel
+from cudf.core.udf.row_function import DataFrameApplyKernel
 from cudf.errors import MixedTypeError
 from cudf.utils import applyutils, docutils, ioutils, queryutils
 from cudf.utils.docutils import copy_docstring
@@ -4903,7 +4903,7 @@ def apply(
         if by_row != "compat":
             raise NotImplementedError("by_row is currently not supported.")
 
-        return self._apply(func, _get_row_kernel, *args, **kwargs)
+        return self._apply(func, DataFrameApplyKernel, *args, **kwargs)
 
     def applymap(
         self,
 
@@ -53,7 +53,6 @@
 from cudf.core.resample import _Resampler
 from cudf.core.scalar import pa_scalar_to_plc_scalar
 from cudf.core.udf.utils import (
-    _compile_or_get,
     _get_input_args_from_frame,
     _post_process_output_col,
     _return_arr_from_dtype,
@@ -3471,14 +3470,13 @@ def add_suffix(self, suffix, axis=None):
 
     @acquire_spill_lock()
     @_performance_tracking
-    def _apply(self, func, kernel_getter, *args, **kwargs):
+    def _apply(self, func, kernel_class, *args, **kwargs):
         """Apply `func` across the rows of the frame."""
         if kwargs:
             raise ValueError("UDFs using **kwargs are not yet supported.")
         try:
-            kernel, retty = _compile_or_get(
-                self, func, args, kernel_getter=kernel_getter
-            )
+            kr = kernel_class(self, func, args)
+            kernel, retty = kr.get_kernel()
         except Exception as e:
             raise ValueError(
                 "user defined function compilation failed."
 
@@ -59,7 +59,7 @@
 )
 from cudf.core.resample import SeriesResampler
 from cudf.core.single_column_frame import SingleColumnFrame
-from cudf.core.udf.scalar_function import _get_scalar_kernel
+from cudf.core.udf.scalar_function import SeriesApplyKernel
 from cudf.utils import docutils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
@@ -2636,7 +2636,7 @@ def apply(
         elif by_row != "compat":
             raise NotImplementedError("by_row is currently not supported.")
 
-        result = self._apply(func, _get_scalar_kernel, *args, **kwargs)
+        result = self._apply(func, SeriesApplyKernel, *args, **kwargs)
         result.name = self.name
         return result
 
 
@@ -1,6 +1,8 @@
 # Copyright (c) 2022-2025, NVIDIA CORPORATION.
 
 
+from functools import cache
+
 import cupy as cp
 import numpy as np
 from numba import cuda, types
@@ -19,15 +21,12 @@
     group_initializer_template,
     groupby_apply_kernel_template,
 )
+from cudf.core.udf.udf_kernel_base import ApplyKernelBase
 from cudf.core.udf.utils import (
     UDFError,
     _all_dtypes_from_frame,
-    _compile_or_get,
     _get_extensionty_size,
-    _get_kernel,
-    _get_udf_return_type,
     _supported_cols_from_frame,
-    _supported_dtypes_from_frame,
 )
 from cudf.utils._numba import _CUDFNumbaConfig
 from cudf.utils.performance_tracking import _performance_tracking
@@ -105,27 +104,6 @@ def _groupby_apply_kernel_string_from_template(frame, args):
     )
 
 
-def _get_groupby_apply_kernel(frame, func, args):
-    np_field_types = np.dtype(list(_all_dtypes_from_frame(frame).items()))
-    dataframe_group_type = _get_frame_groupby_type(
-        np_field_types, frame.index.dtype
-    )
-
-    return_type = _get_udf_return_type(dataframe_group_type, func, args)
-
-    # Dict of 'local' variables into which `_kernel` is defined
-    global_exec_context = {
-        "cuda": cuda,
-        "Group": Group,
-        "dataframe_group_type": dataframe_group_type,
-        "types": types,
-    }
-    kernel_string = _groupby_apply_kernel_string_from_template(frame, args)
-    kernel = _get_kernel(kernel_string, global_exec_context, None, func)
-
-    return kernel, return_type
-
-
 @_performance_tracking
 def jit_groupby_apply(offsets, grouped_values, function, *args):
     """
@@ -143,13 +121,8 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
         The user-defined function to execute
     """
 
-    kernel, return_type = _compile_or_get(
-        grouped_values,
-        function,
-        args,
-        kernel_getter=_get_groupby_apply_kernel,
-        suffix="__GROUPBY_APPLY_UDF",
-    )
+    kr = GroupByApplyKernel(grouped_values, function, args)
+    kernel, return_type = kr.get_kernel()
 
     offsets = cp.asarray(offsets)
     ngroups = len(offsets) - 1
@@ -211,18 +184,70 @@ def _can_be_jitted(frame, func, args):
 
     if any(col.has_nulls() for col in frame._columns):
         return False
-    np_field_types = np.dtype(
-        list(
-            _supported_dtypes_from_frame(
-                frame, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES
-            ).items()
-        )
-    )
-    dataframe_group_type = _get_frame_groupby_type(
-        np_field_types, frame.index.dtype
-    )
+    kr = GroupByApplyKernel(frame, func, args)
     try:
-        _get_udf_return_type(dataframe_group_type, func, args)
+        kr._get_udf_return_type()
         return True
     except (UDFError, TypingError):
         return False
+
+
+class GroupByApplyKernel(ApplyKernelBase):
+    """
+    Class representing a kernel that computes the result of
+    a GroupBy.apply operation. Expects that the user passed
+    a function that operates on a single group of the data,
+    for example
+
+    def f(group):
+        return group['x'].sum() + group['y'].sum()
+    """
+
+    @property
+    def kernel_type(self):
+        """Name identifying this kind of kernel."""
+        return "groupby_apply"
+
+    def _get_frame_type(self):
+        """Build the group type from the column and index dtypes."""
+        return _get_frame_groupby_type(
+            np.dtype(list(_all_dtypes_from_frame(self.frame).items())),
+            self.frame.index.dtype,
+        )
+
+    def _get_kernel_string(self):
+        """Fill ``groupby_apply_kernel_template`` for this frame and args."""
+        # Create argument list for kernel
+        frame = _supported_cols_from_frame(
+            self.frame, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES
+        )
+        input_columns = ", ".join(f"input_col_{i}" for i in range(len(frame)))
+        extra_args = ", ".join(f"extra_arg_{i}" for i in range(len(self.args)))
+        # Generate the initializers for each device function argument
+        initializers = [
+            group_initializer_template.format(idx=i, name=colname)
+            for i, colname in enumerate(frame.keys())
+        ]
+        return groupby_apply_kernel_template.format(
+            input_columns=input_columns,
+            extra_args=extra_args,
+            group_initializers="\n".join(initializers),
+        )
+
+    def _get_kernel_string_exec_context(self):
+        """Return the execution context dict for the kernel string."""
+        # Memoized per instance: functools.cache on a method is keyed on
+        # ``self`` and would keep every instance (and its frame) alive.
+        exec_context = getattr(self, "_kernel_string_exec_context", None)
+        if exec_context is None:
+            exec_context = self._kernel_string_exec_context = {
+                "cuda": cuda,
+                "Group": Group,
+                "dataframe_group_type": self._get_frame_type(),
+                "types": types,
+            }
+        return exec_context
+
+    def _construct_signature(self, return_type):
+        """Return ``None``: no explicit signature is built for this kernel."""
+        return None
@@ -1,9 +1,10 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 
 import operator
 
 import numpy as np
 from numba import types
+from numba.core.datamodel import default_manager
 from numba.core.extending import (
     make_attribute_wrapper,
     models,
@@ -28,6 +29,7 @@
     comparison_ops,
     unary_ops,
 )
+from cudf.core.udf.nrt_utils import _current_nrt_context
 from cudf.core.udf.strings_typing import (
     StringView,
     UDFString,
@@ -108,6 +110,12 @@ class MaskedType(types.Type):
     def __init__(self, value):
         # MaskedType in Numba shall be parameterized
         # with a value type
+        if default_manager[value].has_nrt_meminfo():
+            ctx = _current_nrt_context.get(None)
+            if ctx is not None:
+                # An NRT-usage capture is active (a compilation is probing
+                # whether NRT must be linked): flag that this type needs it.
+                ctx.use_nrt = True
         self.value_type = _type_to_masked_type(value)
         super().__init__(name=f"Masked({self.value_type})")
 
 
@@ -0,0 +1,45 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+import contextvars
+from contextlib import contextmanager
+
+from numba import config as numba_config
+
+_current_nrt_context: contextvars.ContextVar = contextvars.ContextVar(
+    "current_nrt_context"
+)
+
+
+class CaptureNRTUsage:
+    """
+    Context manager recording whether NRT is needed. While it is
+    active, managed types may set ``use_nrt`` to True on instantiation
+    to signal that NRT must be enabled during code generation.
+    """
+
+    def __init__(self):
+        self.use_nrt = False
+
+    def __enter__(self):
+        # Publish this instance; keep the token so exit can undo it.
+        self._reset_token = _current_nrt_context.set(self)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        _current_nrt_context.reset(self._reset_token)
+
+
+@contextmanager
+def nrt_enabled():
+    """
+    Temporarily switch on ``CUDA_ENABLE_NRT`` in the numba config.
+    The flag may be toggled dynamically for a single kernel launch,
+    so kernels known to need NRT are launched inside this context.
+    """
+    saved_flag = numba_config.CUDA_ENABLE_NRT
+    numba_config.CUDA_ENABLE_NRT = True
+    try:
+        yield
+    finally:
+        # Put back whatever was configured before, even on error.
+        numba_config.CUDA_ENABLE_NRT = saved_flag