-
Notifications
You must be signed in to change notification settings - Fork 229
pyarrow: Check compatibility of pyarrow.array with string type #2933
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 18 commits
1f32a7c
4c4e064
07fbca6
d379e46
cfda386
0a6cda5
f59f93c
757da24
17c1e9c
0105d64
3ad0c86
4bea288
371174a
b588730
faf2065
b2efbb4
9fd77dc
ccf4eff
44d01ed
a927202
7b00248
7dc353b
ce76152
ef431af
acaf350
6ad6eb9
d88accd
265132e
edb3438
8172102
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
import pandas as pd | ||
import xarray as xr | ||
from packaging.version import Version | ||
from pygmt._typing import StringArrayTypes | ||
from pygmt.exceptions import GMTInvalidInput | ||
|
||
|
||
|
@@ -273,14 +274,15 @@ def sequence_to_ctypes_array( | |
return (ctype * size)(*sequence) | ||
|
||
|
||
def strings_to_ctypes_array(strings: Sequence[str] | np.ndarray) -> ctp.Array: | ||
def strings_to_ctypes_array(strings: StringArrayTypes) -> ctp.Array: | ||
""" | ||
Convert a sequence (e.g., a list) of strings into a ctypes array. | ||
Convert a sequence (e.g., a list) or numpy.ndarray of strings or a | ||
pyarrow.StringArray into a ctypes array. | ||
|
||
Parameters | ||
---------- | ||
strings | ||
A sequence of strings. | ||
A sequence of strings, a numpy.ndarray of str dtype, or a pyarrow.StringArray. | ||
|
||
Returns | ||
------- | ||
|
@@ -296,7 +298,7 @@ def strings_to_ctypes_array(strings: Sequence[str] | np.ndarray) -> ctp.Array: | |
>>> [s.decode() for s in ctypes_array] | ||
['first', 'second', 'third'] | ||
""" | ||
return (ctp.c_char_p * len(strings))(*[s.encode() for s in strings]) | ||
return (ctp.c_char_p * len(strings))(*[s.encode() for s in np.asarray(strings)]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. With PR #3507, I believe we can revert changes in this file, and only let the function support a sequence of strings and a string dtype numpy array. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reverted in a927202 |
||
|
||
|
||
def array_to_datetime(array: Sequence[Any] | np.ndarray) -> np.ndarray: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,12 +8,31 @@ | |
from pygmt import clib | ||
from pygmt.exceptions import GMTCLibError | ||
from pygmt.helpers import GMTTempFile | ||
from pygmt.helpers.testing import skip_if_no | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also need to revert changes in this PR, since the low-level function There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok, just to document things, the way we are intending pyarrow.string inputs to go through high-level modules like
For the record, the original intention in this PR, mostly in commit d379e46, was to let |
||
|
||
try: | ||
import pyarrow as pa | ||
except ImportError: | ||
pa = None | ||
|
||
|
||
@pytest.mark.benchmark | ||
def test_put_strings(): | ||
@pytest.mark.parametrize( | ||
("array_func", "dtype"), | ||
[ | ||
pytest.param(np.array, {"dtype": np.str_}, id="str"), | ||
pytest.param( | ||
getattr(pa, "array", None), | ||
{"type": "string"}, # pa.string() | ||
marks=skip_if_no(package="pyarrow"), | ||
id="pyarrow", | ||
), | ||
], | ||
) | ||
def test_put_strings(array_func, dtype): | ||
""" | ||
Check that assigning a numpy array of dtype str to a dataset works. | ||
Check that assigning a numpy array of dtype str, or a pyarrow.StringArray to a | ||
dataset works. | ||
""" | ||
with clib.Session() as lib: | ||
dataset = lib.create_data( | ||
|
@@ -24,7 +43,7 @@ def test_put_strings(): | |
) | ||
x = np.array([1, 2, 3, 4, 5], dtype=np.int32) | ||
y = np.array([6, 7, 8, 9, 10], dtype=np.int32) | ||
strings = np.array(["a", "bc", "defg", "hijklmn", "opqrst"], dtype=np.str_) | ||
strings = array_func(["a", "bc", "defg", "hijklmn", "opqrst"], **dtype) | ||
lib.put_vector(dataset, column=lib["GMT_X"], vector=x) | ||
lib.put_vector(dataset, column=lib["GMT_Y"], vector=y) | ||
lib.put_strings( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,8 +10,14 @@ | |
from pygmt.clib.session import DTYPES_NUMERIC | ||
from pygmt.exceptions import GMTCLibError, GMTInvalidInput | ||
from pygmt.helpers import GMTTempFile | ||
from pygmt.helpers.testing import skip_if_no | ||
from pygmt.tests.test_clib import mock | ||
|
||
try: | ||
import pyarrow as pa | ||
except ImportError: | ||
pa = None | ||
|
||
POINTS_DATA = Path(__file__).parent / "data" / "points.txt" | ||
|
||
|
||
|
@@ -137,3 +143,37 @@ def test_open_virtual_file(): | |
bounds = "\t".join([f"<{col.min():.0f}/{col.max():.0f}>" for col in data.T]) | ||
expected = f"<matrix memory>: N = {shape[0]}\t{bounds}\n" | ||
assert output == expected | ||
|
||
|
||
@pytest.mark.benchmark | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The test was moved to There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Moved in 6ad6eb9 |
||
@pytest.mark.parametrize( | ||
("array_func", "dtype"), | ||
[ | ||
pytest.param(np.array, {"dtype": np.str_}, id="str"), | ||
pytest.param(np.array, {"dtype": np.object_}, id="object"), | ||
pytest.param( | ||
getattr(pa, "array", None), | ||
{"type": "string"}, # pa.string() | ||
marks=skip_if_no(package="pyarrow"), | ||
id="pyarrow", | ||
), | ||
], | ||
) | ||
def test_virtualfile_from_vectors_one_string_or_object_column(array_func, dtype): | ||
""" | ||
Test passing in one column with string (numpy/pyarrow) or object (numpy) | ||
dtype into virtual file dataset. | ||
""" | ||
size = 5 | ||
x = np.arange(size, dtype=np.int32) | ||
y = np.arange(size, size * 2, 1, dtype=np.int32) | ||
strings = array_func(["a", "bc", "defg", "hijklmn", "opqrst"], **dtype) | ||
with clib.Session() as lib: | ||
with lib.virtualfile_from_vectors(x, y, strings) as vfile: | ||
with GMTTempFile() as outfile: | ||
lib.call_module("convert", [vfile, f"->{outfile.name}"]) | ||
output = outfile.read(keep_tabs=True) | ||
expected = "".join( | ||
f"{i}\t{j}\t{k}\n" for i, j, k in zip(x, y, strings, strict=True) | ||
) | ||
assert output == expected |
Uh oh!
There was an error while loading. Please reload this page.