[data] fix working code snippets #52748

Merged (6 commits) on May 6, 2025

12 changes: 1 addition & 11 deletions doc/source/data/inspecting-data.rst
@@ -51,16 +51,7 @@ For more information like the number of rows, print the Dataset.

.. testoutput::

Dataset(
num_rows=150,
schema={
sepal length (cm): double,
sepal width (cm): double,
petal length (cm): double,
petal width (cm): double,
target: int64
}
)
Dataset(num_rows=..., schema=...)

.. _inspecting-rows:

@@ -138,7 +129,6 @@ of the returned batch, set ``batch_format``.
0 5.1 3.5 ... 0.2 0
1 4.9 3.0 ... 0.2 0
<BLANKLINE>
[2 rows x 5 columns]

For more information on working with batches, see
:ref:`Transforming batches <transforming_batches>` and
5 changes: 1 addition & 4 deletions doc/source/data/working-with-tensors.rst
@@ -21,10 +21,7 @@ Ray Data represents tensors as

.. testoutput::

Dataset(
num_rows=100,
schema={image: numpy.ndarray(shape=(28, 28), dtype=uint8)}
)
Dataset(num_rows=100, schema=...)

Batches of fixed-shape tensors
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 changes: 2 additions & 8 deletions doc/source/ray-contribute/writing-code-snippets.rst
@@ -228,10 +228,7 @@ To ignore parts of a *doctest-style* output, replace problematic sections with e

>>> import ray
>>> ray.data.read_images("s3://anonymous@ray-example-data/image-datasets/simple")
Dataset(
num_rows=...,
schema={image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8)}
)
Dataset(num_rows=..., schema=...)

To ignore an output altogether, write a *code-output-style* snippet. Don't use `# doctest: +SKIP`.

@@ -249,10 +246,7 @@ with ellipses. ::

.. testoutput::

Dataset(
num_rows=...,
schema={image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8)}
)
Dataset(num_rows=..., schema=...)

If your output is nondeterministic and you want to display a sample output, add
`:options: +MOCK`. ::
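
For reference, the pattern these contributor-guide changes converge on looks like the following minimal sketch. It reuses the anonymous example bucket from the snippets above and is illustrative, not part of the PR:

.. testcode::

    import ray

    ds = ray.data.read_images("s3://anonymous@ray-example-data/image-datasets/simple")
    print(ds)

.. testoutput::

    Dataset(num_rows=..., schema=...)

The same ellipsis works in doctest-style output, and for nondeterministic sample output the updated guide recommends adding `:options: +MOCK` to the testoutput block rather than `# doctest: +SKIP`.
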
11 changes: 1 addition & 10 deletions python/ray/data/_internal/plan.py
@@ -373,7 +373,7 @@ def schema(
elif self._logical_plan.dag.aggregate_output_metadata().schema is not None:
schema = self._logical_plan.dag.aggregate_output_metadata().schema

elif fetch_if_missing or self.is_read_only():
elif fetch_if_missing:
# For consistency with the previous implementation, we fetch the schema if
# the plan is read-only even if `fetch_if_missing` is False.

@@ -587,15 +587,6 @@ def has_lazy_input(self) -> bool:
"""Return whether this plan has lazy input blocks."""
return all(isinstance(op, Read) for op in self._logical_plan.sources())

def is_read_only(self, root_op: Optional[LogicalOperator] = None) -> bool:
"""Return whether the LogicalPlan corresponding to `root_op`
contains only a Read op. By default, the last operator of
the LogicalPlan is used."""
if root_op is None:
root_op = self._logical_plan.dag

return root_op.is_read_op()

def has_computed_output(self) -> bool:
"""Whether this plan has a computed snapshot for the final operator, i.e. for
the output of this plan.
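
To make the plan.py change concrete, here is a hedged sketch (not part of the PR) of the user-visible behavior; the CSV path is reused from the doctests elsewhere in this diff, and exact laziness can vary by datasource and available metadata:

import ray

ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")

# A freshly created read is lazy; nothing has executed yet.
print(ds._plan.has_started_execution)  # expected: False

# With the is_read_only() shortcut removed, ExecutionPlan.schema falls back
# to executing a partial read only when fetch_if_missing=True.
print(ds._plan.schema(fetch_if_missing=False))  # may be None

# Dataset.schema() defaults to fetch_if_missing=True, so it still returns a
# schema, triggering a partial read if metadata alone is not enough.
print(ds.schema())
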
44 changes: 4 additions & 40 deletions python/ray/data/dataset.py
@@ -5008,16 +5008,7 @@ def to_tf(
>>> import ray
>>> ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")
>>> ds
Dataset(
num_rows=?,
schema={
sepal length (cm): double,
sepal width (cm): double,
petal length (cm): double,
petal width (cm): double,
target: int64
}
)
Dataset(num_rows=?, schema=...)

If your model accepts a single tensor as input, specify a single feature column.

@@ -5039,16 +5030,7 @@
>>> ds = preprocessor.transform(ds)
>>> ds
Concatenator
+- Dataset(
num_rows=?,
schema={
sepal length (cm): double,
sepal width (cm): double,
petal length (cm): double,
petal width (cm): double,
target: int64
}
)
+- Dataset(num_rows=?, schema=...)
>>> ds.to_tf("features", "target")
<_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>

@@ -5753,16 +5735,7 @@ def serialize_lineage(self) -> bytes:

.. testoutput::

Dataset(
num_rows=?,
schema={
sepal length (cm): double,
sepal width (cm): double,
petal length (cm): double,
petal width (cm): double,
target: int64
}
)
Dataset(num_rows=?, schema=...)


Returns:
@@ -5835,16 +5808,7 @@ def deserialize_lineage(serialized_ds: bytes) -> "Dataset":

.. testoutput::

Dataset(
num_rows=?,
schema={
sepal length (cm): double,
sepal width (cm): double,
petal length (cm): double,
petal width (cm): double,
target: int64
}
)
Dataset(num_rows=?, schema=...)

Args:
serialized_ds: The serialized Dataset that we wish to deserialize.
22 changes: 2 additions & 20 deletions python/ray/data/iterator.py
@@ -681,16 +681,7 @@ def to_tf(
... "s3://anonymous@air-example-data/iris.csv"
... )
>>> it = ds.iterator(); it
DataIterator(Dataset(
num_rows=?,
schema={
sepal length (cm): double,
sepal width (cm): double,
petal length (cm): double,
petal width (cm): double,
target: int64
}
))
DataIterator(Dataset(num_rows=?, schema=...))

If your model accepts a single tensor as input, specify a single feature column.

@@ -712,16 +703,7 @@
>>> it = preprocessor.transform(ds).iterator()
>>> it
DataIterator(Concatenator
+- Dataset(
num_rows=?,
schema={
sepal length (cm): double,
sepal width (cm): double,
petal length (cm): double,
petal width (cm): double,
target: int64
}
))
+- Dataset(num_rows=?, schema=...))
>>> it.to_tf("features", "target")
<_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>

12 changes: 3 additions & 9 deletions python/ray/data/read_api.py
@@ -1499,7 +1499,7 @@ def read_csv(

>>> ray.data.read_csv("s3://anonymous@ray-example-data/different-extensions/",
... file_extensions=["csv"])
Dataset(num_rows=?, schema={a: int64, b: int64})
Dataset(num_rows=?, schema=...)

Args:
paths: A single file or directory, or a list of file or directory paths.
@@ -1944,10 +1944,7 @@ def read_tfrecords(
Examples:
>>> import ray
>>> ray.data.read_tfrecords("s3://anonymous@ray-example-data/iris.tfrecords")
Dataset(
num_rows=?,
schema={...}
)
Dataset(num_rows=?, schema=...)

We can also read compressed TFRecord files, which use one of the
`compression types supported by Arrow <https://arrow.apache.org/docs/python/\
@@ -1957,10 +1954,7 @@
... "s3://anonymous@ray-example-data/iris.tfrecords.gz",
... arrow_open_stream_args={"compression": "gzip"},
... )
Dataset(
num_rows=?,
schema={...}
)
Dataset(num_rows=?, schema=...)

Args:
paths: A single file or directory, or a list of file or directory paths.
8 changes: 1 addition & 7 deletions python/ray/data/tests/test_mongo.py
@@ -251,13 +251,7 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo):
collection=foo_collection,
override_num_blocks=1000,
)
assert str(ds) == (
"Dataset(\n"
" num_rows=5,\n"
" schema={_id: fixed_size_binary[12], float_field: double, "
"int_field: int32}\n"
")"
)
assert str(ds) == ("Dataset(num_rows=5, schema=Unknown schema)")
assert df.equals(ds.drop_columns(["_id"]).to_pandas())

# Read a subset of the collection.
5 changes: 2 additions & 3 deletions python/ray/data/tests/test_parquet.py
@@ -394,10 +394,9 @@ def test_parquet_read_bulk(ray_start_regular_shared, fs, data_path):
assert "test1.parquet" in str(input_files)
assert "test2.parquet" in str(input_files)
assert not ds._plan.has_started_execution
assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))

# Schema isn't available, so we do a partial read.
assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))
assert ds._plan.has_started_execution
assert not ds._plan.has_computed_output()

# Forces a data read.
@@ -477,7 +476,7 @@ def test_parquet_read_bulk_meta_provider(ray_start_regular_shared, fs, data_path
assert ds.count() == 6
assert ds.size_bytes() > 0
assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))
assert ds._plan.has_started_execution
assert not ds._plan.has_started_execution

# Forces a data read.
values = [[s["one"], s["two"]] for s in ds.take()]