Skip to content

Commit 2c3055d

Browse files
committed
Merge pull request #16812 from rapidsai/branch-24.08
2 parents e776742 + 2607537 commit 2c3055d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+2766
-453
lines changed

.github/workflows/pr.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ jobs:
2727
- wheel-tests-cudf
2828
- wheel-build-cudf-polars
2929
- wheel-tests-cudf-polars
30+
- cudf-polars-polars-tests
3031
- wheel-build-dask-cudf
3132
- wheel-tests-dask-cudf
3233
- devcontainer
@@ -154,6 +155,17 @@ jobs:
154155
# This always runs, but only fails if this PR touches code in
155156
# pylibcudf or cudf_polars
156157
script: "ci/test_wheel_cudf_polars.sh"
158+
cudf-polars-polars-tests:
159+
needs: wheel-build-cudf-polars
160+
secrets: inherit
161+
uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
162+
with:
163+
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
164+
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
165+
build_type: pull-request
166+
# This always runs, but only fails if this PR touches code in
167+
# pylibcudf or cudf_polars
168+
script: "ci/test_cudf_polars_polars_tests.sh"
157169
wheel-build-dask-cudf:
158170
needs: wheel-build-cudf
159171
secrets: inherit

ci/run_cudf_polars_polars_tests.sh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
2+
# Copyright (c) 2024, NVIDIA CORPORATION.
3+
4+
set -euo pipefail
5+
6+
# Support invoking run_cudf_polars_polars_tests.sh outside the script directory
7+
# Assumption: polars has been cloned in the root of the repo.
8+
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../polars/
9+
10+
DESELECTED_TESTS=(
11+
"tests/unit/test_polars_import.py::test_polars_import" # relies on a polars built in place
12+
"tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode
13+
"tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error
14+
"tests/docs/test_user_guide.py" # No dot binary in CI image
15+
)
16+
17+
DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}")
18+
python -m pytest \
19+
--import-mode=importlib \
20+
--cache-clear \
21+
-m "" \
22+
-p cudf_polars.testing.plugin \
23+
-v \
24+
--tb=short \
25+
${DESELECTED_TESTS} \
26+
"$@" \
27+
py-polars/tests

ci/test_cudf_polars_polars_tests.sh

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/bin/bash
2+
# Copyright (c) 2024, NVIDIA CORPORATION.
3+
4+
set -eou pipefail
5+
6+
# We will only fail these tests if the PR touches code in pylibcudf
7+
# or cudf_polars itself.
8+
# Note, the three dots mean we are doing a diff between the merge-base
9+
# of upstream and HEAD. So this is asking, "does _this branch_ touch
10+
# files in cudf_polars/pylibcudf", rather than "are there changes
11+
# between upstream and this branch which touch cudf_polars/pylibcudf"
12+
# TODO: is the target branch exposed anywhere in an environment variable?
13+
if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
14+
then
15+
HAS_CHANGES=1
16+
rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
17+
else
18+
HAS_CHANGES=0
19+
rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
20+
fi
21+
22+
rapids-logger "Download wheels"
23+
24+
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
25+
RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
26+
27+
# Download the cudf built in the previous step
28+
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
29+
30+
rapids-logger "Install cudf"
31+
python -m pip install ./local-cudf-dep/cudf*.whl
32+
33+
rapids-logger "Install cudf_polars"
34+
python -m pip install $(echo ./dist/cudf_polars*.whl)
35+
36+
# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
37+
TAG="py-1.7.0"
38+
rapids-logger "Clone polars to ${TAG}"
39+
git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1
40+
41+
# Install requirements for running polars tests
42+
rapids-logger "Install polars test requirements"
43+
python -m pip install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt
44+
45+
function set_exitcode()
46+
{
47+
EXITCODE=$?
48+
}
49+
EXITCODE=0
50+
trap set_exitcode ERR
51+
set +e
52+
53+
rapids-logger "Run polars tests"
54+
./ci/run_cudf_polars_polars_tests.sh
55+
56+
trap ERR
57+
set -e
58+
59+
if [ ${EXITCODE} != 0 ]; then
60+
rapids-logger "Running polars test suite FAILED: exitcode ${EXITCODE}"
61+
else
62+
rapids-logger "Running polars test suite PASSED"
63+
fi
64+
65+
if [ ${HAS_CHANGES} == 1 ]; then
66+
exit ${EXITCODE}
67+
else
68+
exit 0
69+
fi

ci/test_wheel_cudf_polars.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,20 +13,29 @@ set -eou pipefail
1313
if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
1414
then
1515
HAS_CHANGES=1
16+
rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
1617
else
1718
HAS_CHANGES=0
19+
rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
1820
fi
1921

22+
rapids-logger "Download wheels"
23+
2024
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
2125
RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
2226

2327
# Download the cudf built in the previous step
2428
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
29+
30+
rapids-logger "Install cudf"
2531
python -m pip install ./local-cudf-dep/cudf*.whl
2632

2733
rapids-logger "Install cudf_polars"
2834
python -m pip install $(echo ./dist/cudf_polars*.whl)[test]
2935

36+
rapids-logger "Pin to 1.7.0 Temporarily"
37+
python -m pip install polars==1.7.0
38+
3039
rapids-logger "Run cudf_polars tests"
3140

3241
function set_exitcode()

cpp/include/cudf/detail/indexalator.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ struct input_indexalator : base_normalator<input_indexalator, cudf::size_type> {
9393
*/
9494
__device__ inline cudf::size_type operator[](size_type idx) const
9595
{
96-
void const* tp = p_ + (idx * this->width_);
96+
void const* tp = p_ + (static_cast<std::ptrdiff_t>(idx) * this->width_);
9797
return type_dispatcher(this->dtype_, normalize_type{}, tp);
9898
}
9999

@@ -109,7 +109,7 @@ struct input_indexalator : base_normalator<input_indexalator, cudf::size_type> {
109109
CUDF_HOST_DEVICE input_indexalator(void const* data, data_type dtype, cudf::size_type offset = 0)
110110
: base_normalator<input_indexalator, cudf::size_type>(dtype), p_{static_cast<char const*>(data)}
111111
{
112-
p_ += offset * this->width_;
112+
p_ += static_cast<std::ptrdiff_t>(offset) * this->width_;
113113
}
114114

115115
protected:
@@ -165,7 +165,7 @@ struct output_indexalator : base_normalator<output_indexalator, cudf::size_type>
165165
__device__ inline output_indexalator const operator[](size_type idx) const
166166
{
167167
output_indexalator tmp{*this};
168-
tmp.p_ += (idx * this->width_);
168+
tmp.p_ += static_cast<std::ptrdiff_t>(idx) * this->width_;
169169
return tmp;
170170
}
171171

dependencies.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -631,7 +631,7 @@ dependencies:
631631
common:
632632
- output_types: [conda, requirements, pyproject]
633633
packages:
634-
- polars>=1.0,<1.3
634+
- polars>=1.6
635635
run_dask_cudf:
636636
common:
637637
- output_types: [conda, requirements, pyproject]
Loading
Loading
Loading
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
cuDF-based GPU backend for Polars [Open Beta]
2+
=============================================
3+
4+
cuDF supports an in-memory, GPU-accelerated execution engine for Python users of the Polars Lazy API.
5+
The engine supports most of the core expressions and data types as well as a growing set of more advanced dataframe manipulations
6+
and data file formats. When using the GPU engine, Polars will convert expressions into an optimized query plan and determine
7+
whether the plan is supported on the GPU. If it is not, the execution will transparently fall back to the standard Polars engine
8+
and run on the CPU.
9+
10+
Benchmark
11+
---------
12+
We reproduced the `Polars Decision Support (PDS) <https://github.com/pola-rs/polars-benchmark>`__ benchmark to compare the Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results:
13+
14+
.. figure:: ../_static/pds_benchmark_polars.png
15+
:width: 600px
16+
17+
18+
19+
You can see up to a 13x speedup using the GPU backend on the compute-heavy PDS queries involving complex aggregation and join operations. Below are the speedups for the top-performing queries:
20+
21+
22+
.. figure:: ../_static/compute_heavy_queries_polars.png
23+
:width: 1000px
24+
25+
:emphasis:`PDS-H benchmark | GPU: NVIDIA H100 PCIe | CPU: Intel Xeon W9-3495X (Sapphire Rapids) | Storage: Local NVMe`
26+
27+
You can reproduce the results by visiting the `Polars Decision Support (PDS) GitHub repository <https://github.com/pola-rs/polars-benchmark>`__.
28+
29+
Learn More
30+
----------
31+
32+
The GPU backend for Polars is now available in Open Beta and the engine is undergoing rapid development. To learn more, visit the `GPU Support page <https://docs.pola.rs/user-guide/gpu-support/>`__ on the Polars website.
33+
34+
Launch on Google Colab
35+
----------------------
36+
37+
.. figure:: ../_static/colab.png
38+
:width: 200px
39+
:target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb
40+
41+
Take the cuDF backend for Polars for a test-drive in a free GPU-enabled notebook environment using your Google account by `launching on Colab <https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb>`__.

docs/cudf/source/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,6 @@ other operations.
2929

3030
user_guide/index
3131
cudf_pandas/index
32+
cudf_polars/index
3233
libcudf_docs/index
3334
developer_guide/index

docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ strings
77
contains
88
replace
99
slice
10+
strip
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
=====
2+
strip
3+
=====
4+
5+
.. automodule:: cudf._lib.pylibcudf.strings.strip
6+
:members:

python/cudf/cudf/_lib/datetime.pyx

Lines changed: 5 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
1616
from cudf._lib.pylibcudf.libcudf.types cimport size_type
1717
from cudf._lib.scalar cimport DeviceScalar
1818

19+
import cudf._lib.pylibcudf as plc
20+
1921

2022
@acquire_spill_lock()
2123
def add_months(Column col, Column months):
@@ -37,43 +39,9 @@ def add_months(Column col, Column months):
3739

3840
@acquire_spill_lock()
3941
def extract_datetime_component(Column col, object field):
40-
41-
cdef unique_ptr[column] c_result
42-
cdef column_view col_view = col.view()
43-
44-
with nogil:
45-
if field == "year":
46-
c_result = move(libcudf_datetime.extract_year(col_view))
47-
elif field == "month":
48-
c_result = move(libcudf_datetime.extract_month(col_view))
49-
elif field == "day":
50-
c_result = move(libcudf_datetime.extract_day(col_view))
51-
elif field == "weekday":
52-
c_result = move(libcudf_datetime.extract_weekday(col_view))
53-
elif field == "hour":
54-
c_result = move(libcudf_datetime.extract_hour(col_view))
55-
elif field == "minute":
56-
c_result = move(libcudf_datetime.extract_minute(col_view))
57-
elif field == "second":
58-
c_result = move(libcudf_datetime.extract_second(col_view))
59-
elif field == "millisecond":
60-
c_result = move(
61-
libcudf_datetime.extract_millisecond_fraction(col_view)
62-
)
63-
elif field == "microsecond":
64-
c_result = move(
65-
libcudf_datetime.extract_microsecond_fraction(col_view)
66-
)
67-
elif field == "nanosecond":
68-
c_result = move(
69-
libcudf_datetime.extract_nanosecond_fraction(col_view)
70-
)
71-
elif field == "day_of_year":
72-
c_result = move(libcudf_datetime.day_of_year(col_view))
73-
else:
74-
raise ValueError(f"Invalid datetime field: '{field}'")
75-
76-
result = Column.from_unique_ptr(move(c_result))
42+
result = Column.from_pylibcudf(
43+
plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field)
44+
)
7745

7846
if field == "weekday":
7947
# Pandas counts Monday-Sunday as 0-6

python/cudf/cudf/_lib/pylibcudf/column.pyx

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,11 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type
1515

1616
from .gpumemoryview cimport gpumemoryview
1717
from .scalar cimport Scalar
18-
from .types cimport DataType, type_id
18+
from .types cimport DataType, size_of, type_id
1919
from .utils cimport int_to_bitmask_ptr, int_to_void_ptr
2020

2121
import functools
2222

23-
import numpy as np
24-
2523

2624
cdef class Column:
2725
"""A container of nullable device data as a column of elements.
@@ -303,14 +301,15 @@ cdef class Column:
303301
raise ValueError("mask not yet supported.")
304302

305303
typestr = iface['typestr'][1:]
304+
data_type = _datatype_from_dtype_desc(typestr)
305+
306306
if not is_c_contiguous(
307307
iface['shape'],
308308
iface['strides'],
309-
np.dtype(typestr).itemsize
309+
size_of(data_type)
310310
):
311311
raise ValueError("Data must be C-contiguous")
312312

313-
data_type = _datatype_from_dtype_desc(typestr)
314313
size = iface['shape'][0]
315314
return Column(
316315
data_type,

python/cudf/cudf/_lib/pylibcudf/datetime.pyx

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,16 @@ from libcpp.utility cimport move
44

55
from cudf._lib.pylibcudf.libcudf.column.column cimport column
66
from cudf._lib.pylibcudf.libcudf.datetime cimport (
7+
day_of_year as cpp_day_of_year,
8+
extract_day as cpp_extract_day,
9+
extract_hour as cpp_extract_hour,
10+
extract_microsecond_fraction as cpp_extract_microsecond_fraction,
11+
extract_millisecond_fraction as cpp_extract_millisecond_fraction,
12+
extract_minute as cpp_extract_minute,
13+
extract_month as cpp_extract_month,
14+
extract_nanosecond_fraction as cpp_extract_nanosecond_fraction,
15+
extract_second as cpp_extract_second,
16+
extract_weekday as cpp_extract_weekday,
717
extract_year as cpp_extract_year,
818
)
919

@@ -31,3 +41,42 @@ cpdef Column extract_year(
3141
with nogil:
3242
result = move(cpp_extract_year(values.view()))
3343
return Column.from_libcudf(move(result))
44+
45+
46+
def extract_datetime_component(Column col, str field):
47+
48+
cdef unique_ptr[column] c_result
49+
50+
with nogil:
51+
if field == "year":
52+
c_result = move(cpp_extract_year(col.view()))
53+
elif field == "month":
54+
c_result = move(cpp_extract_month(col.view()))
55+
elif field == "day":
56+
c_result = move(cpp_extract_day(col.view()))
57+
elif field == "weekday":
58+
c_result = move(cpp_extract_weekday(col.view()))
59+
elif field == "hour":
60+
c_result = move(cpp_extract_hour(col.view()))
61+
elif field == "minute":
62+
c_result = move(cpp_extract_minute(col.view()))
63+
elif field == "second":
64+
c_result = move(cpp_extract_second(col.view()))
65+
elif field == "millisecond":
66+
c_result = move(
67+
cpp_extract_millisecond_fraction(col.view())
68+
)
69+
elif field == "microsecond":
70+
c_result = move(
71+
cpp_extract_microsecond_fraction(col.view())
72+
)
73+
elif field == "nanosecond":
74+
c_result = move(
75+
cpp_extract_nanosecond_fraction(col.view())
76+
)
77+
elif field == "day_of_year":
78+
c_result = move(cpp_day_of_year(col.view()))
79+
else:
80+
raise ValueError(f"Invalid datetime field: '{field}'")
81+
82+
return Column.from_libcudf(move(c_result))

0 commit comments

Comments
 (0)