From d7a4f76e5fe7a2a878d84b5574187bf48d8f528f Mon Sep 17 00:00:00 2001 From: will Date: Mon, 22 Dec 2025 19:53:03 +0800 Subject: [PATCH] [Data]Move extension types to ray.data ray.air.util.objection_extensions -> ray.data._internal.object_extensions ray.air.util.tensor_extensions -> ray.data._internal.tensor_extensions Signed-off-by: will --- python/ray/air/_internal/tensorflow_utils.py | 68 +- python/ray/air/util/data_batch_conversion.py | 8 +- python/ray/data/__init__.py | 2 +- python/ray/data/_internal/arrow_block.py | 12 +- .../_internal/arrow_ops/transform_pyarrow.py | 18 +- .../datasource/huggingface_datasource.py | 2 +- .../_internal/datasource/json_datasource.py | 2 +- .../datasource/tfrecords_datasource.py | 2 +- .../_internal/execution/operators/join.py | 2 +- python/ray/data/_internal/numpy_support.py | 2 +- .../_internal}/object_extensions/__init__.py | 0 .../_internal}/object_extensions/arrow.py | 4 +- .../_internal}/object_extensions/pandas.py | 4 +- python/ray/data/_internal/pandas_block.py | 18 +- .../_internal/planner/exchange/interfaces.py | 2 +- python/ray/data/_internal/table_block.py | 2 +- .../_internal}/tensor_extensions/__init__.py | 0 .../_internal}/tensor_extensions/arrow.py | 22 +- .../_internal}/tensor_extensions/pandas.py | 23 +- .../_internal}/tensor_extensions/utils.py | 2 +- python/ray/data/_internal/utils/__init__.py | 0 .../data/_internal/utils/tensorflow_utils.py | 139 ++++ .../_internal/utils}/transform_pyarrow.py | 7 +- python/ray/data/block.py | 4 +- python/ray/data/collate_fn.py | 3 +- python/ray/data/constants.py | 6 + python/ray/data/dataset.py | 8 +- python/ray/data/datatype.py | 10 +- python/ray/data/extensions/__init__.py | 2 +- .../ray/data/extensions/object_extension.py | 4 +- .../ray/data/extensions/tensor_extension.py | 8 +- python/ray/data/iterator.py | 8 +- python/ray/data/preprocessor.py | 4 +- python/ray/data/preprocessors/chain.py | 2 +- python/ray/data/preprocessors/encoder.py | 2 +- 
python/ray/data/preprocessors/torch.py | 6 +- python/ray/data/read_api.py | 4 +- python/ray/data/stats.py | 2 +- python/ray/data/tests/conftest.py | 4 +- .../data/tests/preprocessors/test_chain.py | 2 +- .../tests/preprocessors/test_preprocessors.py | 4 +- python/ray/data/tests/test_arrow_block.py | 6 +- python/ray/data/tests/test_daft.py | 4 +- python/ray/data/tests/test_ecosystem_dask.py | 2 +- python/ray/data/tests/test_image.py | 6 +- python/ray/data/tests/test_numpy.py | 2 +- python/ray/data/tests/test_numpy_support.py | 2 +- python/ray/data/tests/test_pandas.py | 7 +- python/ray/data/tests/test_parquet.py | 8 +- python/ray/data/tests/test_strict_mode.py | 4 +- python/ray/data/tests/test_tensor.py | 4 +- .../ray/data/tests/test_tensor_extension.py | 8 +- .../ray/data/tests/test_torch_tensor_utils.py | 2 +- .../ray/data/tests/test_transform_pyarrow.py | 12 +- .../tests/unit/test_arrow_type_conversion.py | 8 +- .../tests/unit/test_data_batch_conversion.py | 10 +- .../data/tests/unit/test_object_extension.py | 4 +- python/ray/data/util/__init__.py | 0 python/ray/data/util/data_batch_conversion.py | 353 ++++++++++ python/ray/data/util/torch_utils.py | 618 ++++++++++++++++++ .../tests/test_torch_detection_predictor.py | 2 +- 61 files changed, 1281 insertions(+), 205 deletions(-) rename python/ray/{air/util => data/_internal}/object_extensions/__init__.py (100%) rename python/ray/{air/util => data/_internal}/object_extensions/arrow.py (97%) rename python/ray/{air/util => data/_internal}/object_extensions/pandas.py (96%) rename python/ray/{air/util => data/_internal}/tensor_extensions/__init__.py (100%) rename python/ray/{air/util => data/_internal}/tensor_extensions/arrow.py (99%) rename python/ray/{air/util => data/_internal}/tensor_extensions/pandas.py (98%) rename python/ray/{air/util => data/_internal}/tensor_extensions/utils.py (99%) create mode 100644 python/ray/data/_internal/utils/__init__.py create mode 100644 
python/ray/data/_internal/utils/tensorflow_utils.py rename python/ray/{air/util => data/_internal/utils}/transform_pyarrow.py (88%) create mode 100644 python/ray/data/constants.py create mode 100644 python/ray/data/util/__init__.py create mode 100644 python/ray/data/util/data_batch_conversion.py create mode 100644 python/ray/data/util/torch_utils.py diff --git a/python/ray/air/_internal/tensorflow_utils.py b/python/ray/air/_internal/tensorflow_utils.py index 46b3c2d1d1b7..1f382f9962c3 100644 --- a/python/ray/air/_internal/tensorflow_utils.py +++ b/python/ray/air/_internal/tensorflow_utils.py @@ -1,14 +1,9 @@ -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import Dict, Optional, Union import numpy as np -import pyarrow import tensorflow as tf from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed -from ray.air.util.tensor_extensions.arrow import get_arrow_extension_tensor_types - -if TYPE_CHECKING: - from ray.data._internal.pandas_block import PandasBlockSchema def convert_ndarray_to_tf_tensor( @@ -74,64 +69,3 @@ def convert_ndarray_batch_to_tf_tensor_batch( } return batch - - -def get_type_spec( - schema: Union["pyarrow.lib.Schema", "PandasBlockSchema"], - columns: Union[str, List[str]], -) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]: - import pyarrow as pa - - from ray.data.extensions import TensorDtype - - tensor_extension_types = get_arrow_extension_tensor_types() - - assert not isinstance(schema, type) - - dtypes: Dict[str, Union[np.dtype, pa.DataType]] = dict( - zip(schema.names, schema.types) - ) - - def get_dtype(dtype: Union[np.dtype, pa.DataType]) -> tf.dtypes.DType: - if isinstance(dtype, pa.ListType): - dtype = dtype.value_type - if isinstance(dtype, pa.DataType): - dtype = dtype.to_pandas_dtype() - if isinstance(dtype, TensorDtype): - dtype = dtype.element_dtype - res = tf.dtypes.as_dtype(dtype) - return res - - def get_shape(dtype: Union[np.dtype, pa.DataType]) -> Tuple[int, ...]: - 
shape = (None,) - if isinstance(dtype, tensor_extension_types): - dtype = dtype.to_pandas_dtype() - if isinstance(dtype, pa.ListType): - shape += (None,) - elif isinstance(dtype, TensorDtype): - shape += dtype.element_shape - return shape - - def get_tensor_spec( - dtype: Union[np.dtype, pa.DataType], *, name: str - ) -> tf.TypeSpec: - - shape, dtype = get_shape(dtype), get_dtype(dtype) - # Batch dimension is always `None`. So, if there's more than one `None`-valued - # dimension, then the tensor is ragged. - is_ragged = sum(dim is None for dim in shape) > 1 - if is_ragged: - type_spec = tf.RaggedTensorSpec(shape, dtype=dtype) - else: - type_spec = tf.TensorSpec(shape, dtype=dtype, name=name) - return type_spec - - if isinstance(columns, str): - name, dtype = columns, dtypes[columns] - return get_tensor_spec(dtype, name=name) - - return { - name: get_tensor_spec(dtype, name=name) - for name, dtype in dtypes.items() - if name in columns - } diff --git a/python/ray/air/util/data_batch_conversion.py b/python/ray/air/util/data_batch_conversion.py index 1bf69b4b9398..e1e5e31f305d 100644 --- a/python/ray/air/util/data_batch_conversion.py +++ b/python/ray/air/util/data_batch_conversion.py @@ -217,10 +217,10 @@ def _convert_batch_type_to_numpy( ) return data elif pyarrow is not None and isinstance(data, pyarrow.Table): - from ray.air.util.tensor_extensions.arrow import ( + from ray.data._internal.arrow_ops import transform_pyarrow + from ray.data._internal.tensor_extensions.arrow import ( get_arrow_extension_fixed_shape_tensor_types, ) - from ray.data._internal.arrow_ops import transform_pyarrow column_values_ndarrays = [] @@ -292,7 +292,7 @@ def _cast_ndarray_columns_to_tensor_extension(df: "pd.DataFrame") -> "pd.DataFra # SettingWithCopyWarning was moved to pd.errors in Pandas 1.5.0. 
SettingWithCopyWarning = pd.errors.SettingWithCopyWarning - from ray.air.util.tensor_extensions.pandas import ( + from ray.data._internal.tensor_extensions.pandas import ( TensorArray, column_needs_tensor_extension, ) @@ -334,7 +334,7 @@ def _cast_tensor_columns_to_ndarrays(df: "pd.DataFrame") -> "pd.DataFrame": except AttributeError: # SettingWithCopyWarning was moved to pd.errors in Pandas 1.5.0. SettingWithCopyWarning = pd.errors.SettingWithCopyWarning - from ray.air.util.tensor_extensions.pandas import TensorDtype + from ray.data._internal.tensor_extensions.pandas import TensorDtype # Try to convert any tensor extension columns to ndarray columns. # TODO(Clark): Optimize this with propagated DataFrame metadata containing a list of diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index 8fc8cf9477e7..6789e25b209c 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -91,7 +91,7 @@ import pyarrow as pa # Import these arrow extension types to ensure that they are registered. 
- from ray.air.util.tensor_extensions.arrow import ( # noqa + from ray.data._internal.tensor_extensions.arrow import ( # noqa ArrowTensorType, ArrowVariableShapedTensorType, ) diff --git a/python/ray/data/_internal/arrow_block.py b/python/ray/data/_internal/arrow_block.py index 669464b65510..919d96606309 100644 --- a/python/ray/data/_internal/arrow_block.py +++ b/python/ray/data/_internal/arrow_block.py @@ -19,15 +19,14 @@ from ray._private.arrow_utils import get_pyarrow_version from ray._private.ray_constants import env_integer -from ray.air.constants import TENSOR_COLUMN_NAME -from ray.air.util.tensor_extensions.arrow import ( - convert_to_pyarrow_array, - pyarrow_table_from_pydict, -) from ray.data._internal.arrow_ops import transform_polars, transform_pyarrow from ray.data._internal.arrow_ops.transform_pyarrow import shuffle from ray.data._internal.row import row_repr, row_repr_pretty, row_str from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder +from ray.data._internal.tensor_extensions.arrow import ( + convert_to_pyarrow_array, + pyarrow_table_from_pydict, +) from ray.data.block import ( Block, BlockAccessor, @@ -38,6 +37,7 @@ BlockType, U, ) +from ray.data.constants import TENSOR_COLUMN_NAME from ray.data.context import DEFAULT_TARGET_MAX_BLOCK_SIZE, DataContext from ray.data.expressions import Expr @@ -272,7 +272,7 @@ def schema(self) -> "pyarrow.lib.Schema": return self._table.schema def to_pandas(self) -> "pandas.DataFrame": - from ray.air.util.data_batch_conversion import _cast_tensor_columns_to_ndarrays + from ray.data.util.data_batch_conversion import _cast_tensor_columns_to_ndarrays # We specify ignore_metadata=True because pyarrow will use the metadata # to build the Table. 
This is handled incorrectly for older pyarrow versions diff --git a/python/ray/data/_internal/arrow_ops/transform_pyarrow.py b/python/ray/data/_internal/arrow_ops/transform_pyarrow.py index cb66568b48d3..4fa5602e664d 100644 --- a/python/ray/data/_internal/arrow_ops/transform_pyarrow.py +++ b/python/ray/data/_internal/arrow_ops/transform_pyarrow.py @@ -9,7 +9,7 @@ from ray._private.arrow_utils import get_pyarrow_version from ray._private.ray_constants import env_integer from ray._private.utils import INT32_MAX -from ray.air.util.tensor_extensions.arrow import ( +from ray.data._internal.tensor_extensions.arrow import ( MIN_PYARROW_VERSION_CHUNKED_ARRAY_TO_NUMPY_ZERO_COPY_ONLY, PYARROW_VERSION, get_arrow_extension_fixed_shape_tensor_types, @@ -143,7 +143,7 @@ def take_table( extension arrays. This is exposed as a static method for easier use on intermediate tables, not underlying an ArrowBlockAccessor. """ - from ray.air.util.transform_pyarrow import ( + from ray.data._internal.utils.transform_pyarrow import ( _concatenate_extension_column, _is_pa_extension_type, ) @@ -176,7 +176,7 @@ def _reconcile_diverging_fields( Returns: A dictionary of diverging fields with their reconciled types. """ - from ray.air.util.object_extensions.arrow import ArrowPythonObjectType + from ray.data._internal.object_extensions.arrow import ArrowPythonObjectType reconciled_fields = {} field_types = defaultdict(list) # field_name -> list of types seen so far @@ -232,8 +232,8 @@ def _reconcile_field( Returns reconciled type or None if default PyArrow handling is sufficient. 
""" - from ray.air.util.object_extensions.arrow import ArrowPythonObjectType - from ray.air.util.tensor_extensions.arrow import ( + from ray.data._internal.object_extensions.arrow import ArrowPythonObjectType + from ray.data._internal.tensor_extensions.arrow import ( get_arrow_extension_tensor_types, ) @@ -431,7 +431,7 @@ def _backfill_missing_fields( """ import pyarrow as pa - from ray.air.util.tensor_extensions.arrow import ( + from ray.data._internal.tensor_extensions.arrow import ( ArrowVariableShapedTensorType, ) @@ -690,7 +690,7 @@ def concat( """ import pyarrow as pa - from ray.air.util.tensor_extensions.arrow import ArrowConversionError + from ray.data._internal.tensor_extensions.arrow import ArrowConversionError from ray.data.extensions import ( ArrowPythonObjectType, get_arrow_extension_tensor_types, @@ -910,7 +910,7 @@ def combine_chunked_array( import pyarrow as pa - from ray.air.util.transform_pyarrow import ( + from ray.data._internal.utils.transform_pyarrow import ( _concatenate_extension_column, _is_pa_extension_type, ) @@ -993,7 +993,7 @@ def _try_combine_chunks_safe( import pyarrow as pa - from ray.air.util.transform_pyarrow import _is_pa_extension_type + from ray.data._internal.utils.transform_pyarrow import _is_pa_extension_type assert not _is_pa_extension_type( array.type diff --git a/python/ray/data/_internal/datasource/huggingface_datasource.py b/python/ray/data/_internal/datasource/huggingface_datasource.py index b015ad46db73..6962ee2e8a04 100644 --- a/python/ray/data/_internal/datasource/huggingface_datasource.py +++ b/python/ray/data/_internal/datasource/huggingface_datasource.py @@ -1,7 +1,7 @@ import sys from typing import TYPE_CHECKING, Iterable, List, Optional, Union -from ray.air.util.tensor_extensions.arrow import pyarrow_table_from_pydict +from ray.data._internal.tensor_extensions.arrow import pyarrow_table_from_pydict from ray.data._internal.util import _check_pyarrow_version from ray.data.block import Block, BlockAccessor, 
BlockMetadata from ray.data.dataset import Dataset diff --git a/python/ray/data/_internal/datasource/json_datasource.py b/python/ray/data/_internal/datasource/json_datasource.py index 7dc1d9c6a85a..df79d174055b 100644 --- a/python/ray/data/_internal/datasource/json_datasource.py +++ b/python/ray/data/_internal/datasource/json_datasource.py @@ -4,8 +4,8 @@ import pandas as pd -from ray.air.util.tensor_extensions.arrow import pyarrow_table_from_pydict from ray.data._internal.pandas_block import PandasBlockAccessor +from ray.data._internal.tensor_extensions.arrow import pyarrow_table_from_pydict from ray.data.context import DataContext from ray.data.datasource.file_based_datasource import FileBasedDatasource diff --git a/python/ray/data/_internal/datasource/tfrecords_datasource.py b/python/ray/data/_internal/datasource/tfrecords_datasource.py index 925f45657e1d..bd69fae531a8 100644 --- a/python/ray/data/_internal/datasource/tfrecords_datasource.py +++ b/python/ray/data/_internal/datasource/tfrecords_datasource.py @@ -5,7 +5,7 @@ import pyarrow -from ray.air.util.tensor_extensions.arrow import pyarrow_table_from_pydict +from ray.data._internal.tensor_extensions.arrow import pyarrow_table_from_pydict from ray.data.aggregate import AggregateFn from ray.data.block import Block from ray.data.datasource.file_based_datasource import FileBasedDatasource diff --git a/python/ray/data/_internal/execution/operators/join.py b/python/ray/data/_internal/execution/operators/join.py index 74f5897bd789..45e158815ed5 100644 --- a/python/ray/data/_internal/execution/operators/join.py +++ b/python/ray/data/_internal/execution/operators/join.py @@ -4,7 +4,6 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type from ray._private.arrow_utils import get_pyarrow_version -from ray.air.util.transform_pyarrow import _is_pa_extension_type from ray.data._internal.arrow_block import ArrowBlockAccessor, ArrowBlockBuilder from ray.data._internal.arrow_ops.transform_pyarrow 
import ( MIN_PYARROW_VERSION_RUN_END_ENCODED_TYPES, @@ -17,6 +16,7 @@ ) from ray.data._internal.logical.operators.join_operator import JoinType from ray.data._internal.util import GiB, MiB +from ray.data._internal.utils.transform_pyarrow import _is_pa_extension_type from ray.data.block import Block from ray.data.context import DataContext diff --git a/python/ray/data/_internal/numpy_support.py b/python/ray/data/_internal/numpy_support.py index 69a6b67e7e61..3fff603f8e2f 100644 --- a/python/ray/data/_internal/numpy_support.py +++ b/python/ray/data/_internal/numpy_support.py @@ -4,7 +4,7 @@ import numpy as np -from ray.air.util.tensor_extensions.utils import ( +from ray.data._internal.tensor_extensions.utils import ( create_ragged_ndarray, is_ndarray_like, ) diff --git a/python/ray/air/util/object_extensions/__init__.py b/python/ray/data/_internal/object_extensions/__init__.py similarity index 100% rename from python/ray/air/util/object_extensions/__init__.py rename to python/ray/data/_internal/object_extensions/__init__.py diff --git a/python/ray/air/util/object_extensions/arrow.py b/python/ray/data/_internal/object_extensions/arrow.py similarity index 97% rename from python/ray/air/util/object_extensions/arrow.py rename to python/ray/data/_internal/object_extensions/arrow.py index 1f78d72e0346..8c93a3c41da9 100644 --- a/python/ray/air/util/object_extensions/arrow.py +++ b/python/ray/data/_internal/object_extensions/arrow.py @@ -5,7 +5,7 @@ import pyarrow as pa from packaging.version import parse as parse_version -import ray.air.util.object_extensions.pandas +import ray.data._internal.object_extensions.pandas from ray._common.serialization import pickle_dumps from ray._private.arrow_utils import _check_pyarrow_version, get_pyarrow_version from ray.util.annotations import PublicAPI @@ -67,7 +67,7 @@ def to_pandas_dtype(self): to the Arrow type. See https://pandas.pydata.org/docs/development/extending.html for more information. 
""" - return ray.air.util.object_extensions.pandas.PythonObjectDtype() + return ray.data._internal.object_extensions.pandas.PythonObjectDtype() def __reduce__(self): # Earlier PyArrow versions require custom pickling behavior. diff --git a/python/ray/air/util/object_extensions/pandas.py b/python/ray/data/_internal/object_extensions/pandas.py similarity index 96% rename from python/ray/air/util/object_extensions/pandas.py rename to python/ray/data/_internal/object_extensions/pandas.py index dbc5732f350b..9ad9634cc4a6 100644 --- a/python/ray/air/util/object_extensions/pandas.py +++ b/python/ray/data/_internal/object_extensions/pandas.py @@ -7,7 +7,7 @@ from pandas._libs import lib from pandas._typing import ArrayLike, Dtype, PositionalIndexer, TakeIndexer, npt -import ray.air.util.object_extensions.arrow +import ray.data._internal.object_extensions.arrow from ray.util.annotations import PublicAPI @@ -76,7 +76,7 @@ def nbytes(self) -> int: return self.values.nbytes def __arrow_array__(self, type=None): - return ray.air.util.object_extensions.arrow.ArrowPythonObjectArray.from_objects( + return ray.data._internal.object_extensions.arrow.ArrowPythonObjectArray.from_objects( self.values ) diff --git a/python/ray/data/_internal/pandas_block.py b/python/ray/data/_internal/pandas_block.py index b8317ecab9b7..086e2121b87c 100644 --- a/python/ray/data/_internal/pandas_block.py +++ b/python/ray/data/_internal/pandas_block.py @@ -18,11 +18,10 @@ import pandas as pd from pandas.api.types import is_object_dtype, is_scalar, is_string_dtype -from ray.air.constants import TENSOR_COLUMN_NAME -from ray.air.util.tensor_extensions.utils import _should_convert_to_tensor from ray.data._internal.numpy_support import convert_to_numpy from ray.data._internal.row import row_repr, row_repr_pretty, row_str from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder +from ray.data._internal.tensor_extensions.utils import _should_convert_to_tensor from ray.data._internal.util 
import is_null from ray.data.block import ( Block, @@ -33,6 +32,7 @@ BlockType, U, ) +from ray.data.constants import TENSOR_COLUMN_NAME from ray.data.context import DataContext from ray.data.expressions import Expr @@ -197,7 +197,7 @@ def value_counts(self) -> Optional[Dict[str, List]]: def hash(self) -> BlockColumn: - from ray.air.util.tensor_extensions.pandas import TensorArrayElement + from ray.data._internal.tensor_extensions.pandas import TensorArrayElement first_non_null = next((x for x in self._column if x is not None), None) if isinstance(first_non_null, TensorArrayElement): @@ -232,7 +232,7 @@ def unique(self) -> BlockColumn: raise def flatten(self) -> BlockColumn: - from ray.air.util.tensor_extensions.pandas import TensorArrayElement + from ray.data._internal.tensor_extensions.pandas import TensorArrayElement first_non_null = next((x for x in self._column if x is not None), None) if not isinstance(first_non_null, TensorArrayElement): @@ -284,7 +284,7 @@ def _is_all_null(self): return not self._column.notna().any() def is_composed_of_lists(self) -> bool: - from ray.air.util.tensor_extensions.pandas import TensorArrayElement + from ray.data._internal.tensor_extensions.pandas import TensorArrayElement types = (list, np.ndarray, TensorArrayElement) first_non_null = next((x for x in self._column if x is not None), None) @@ -317,7 +317,7 @@ def _table_from_pydict(columns: Dict[str, List[Any]]) -> "pandas.DataFrame": @staticmethod def _combine_tables(tables: List["pandas.DataFrame"]) -> "pandas.DataFrame": pandas = lazy_import_pandas() - from ray.air.util.data_batch_conversion import ( + from ray.data.util.data_batch_conversion import ( _cast_ndarray_columns_to_tensor_extension, ) @@ -439,7 +439,7 @@ def schema(self) -> PandasBlockSchema: return schema def to_pandas(self) -> "pandas.DataFrame": - from ray.air.util.data_batch_conversion import _cast_tensor_columns_to_ndarrays + from ray.data.util.data_batch_conversion import _cast_tensor_columns_to_ndarrays ctx = 
DataContext.get_current() table = self._table @@ -480,7 +480,7 @@ def to_numpy( def to_arrow(self) -> "pyarrow.Table": import pyarrow as pa - from ray.air.util.tensor_extensions.pandas import TensorDtype + from ray.data._internal.tensor_extensions.pandas import TensorDtype # Set `preserve_index=False` so that Arrow doesn't add a '__index_level_0__' # column to the resulting table. @@ -521,7 +521,7 @@ def num_rows(self) -> int: return self._table.shape[0] def size_bytes(self) -> int: - from ray.air.util.tensor_extensions.pandas import TensorArray + from ray.data._internal.tensor_extensions.pandas import TensorArray from ray.data.extensions import TensorArrayElement, TensorDtype pd = lazy_import_pandas() diff --git a/python/ray/data/_internal/planner/exchange/interfaces.py b/python/ray/data/_internal/planner/exchange/interfaces.py index 84388543509f..b55322f59deb 100644 --- a/python/ray/data/_internal/planner/exchange/interfaces.py +++ b/python/ray/data/_internal/planner/exchange/interfaces.py @@ -2,12 +2,12 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import ray._private.worker -from ray.air.util.data_batch_conversion import BatchFormat from ray.data._internal.execution.interfaces import RefBundle from ray.data._internal.stats import StatsDict from ray.data._internal.util import convert_bytes_to_human_readable_str from ray.data.block import Block, BlockType from ray.data.context import DataContext +from ray.data.util.data_batch_conversion import BatchFormat if TYPE_CHECKING: diff --git a/python/ray/data/_internal/table_block.py b/python/ray/data/_internal/table_block.py index 3bf1a88f305d..d3f890dd24e7 100644 --- a/python/ray/data/_internal/table_block.py +++ b/python/ray/data/_internal/table_block.py @@ -17,7 +17,6 @@ import numpy as np from ray._private.ray_constants import env_integer -from ray.air.constants import TENSOR_COLUMN_NAME from ray.data._internal.block_builder import BlockBuilder from ray.data._internal.size_estimator 
import SizeEstimator from ray.data._internal.util import ( @@ -36,6 +35,7 @@ KeyType, U, ) +from ray.data.constants import TENSOR_COLUMN_NAME from ray.data.context import DEFAULT_TARGET_MAX_BLOCK_SIZE if TYPE_CHECKING: diff --git a/python/ray/air/util/tensor_extensions/__init__.py b/python/ray/data/_internal/tensor_extensions/__init__.py similarity index 100% rename from python/ray/air/util/tensor_extensions/__init__.py rename to python/ray/data/_internal/tensor_extensions/__init__.py diff --git a/python/ray/air/util/tensor_extensions/arrow.py b/python/ray/data/_internal/tensor_extensions/arrow.py similarity index 99% rename from python/ray/air/util/tensor_extensions/arrow.py rename to python/ray/data/_internal/tensor_extensions/arrow.py index 60510aa12d8e..f1e665c8adf6 100644 --- a/python/ray/air/util/tensor_extensions/arrow.py +++ b/python/ray/data/_internal/tensor_extensions/arrow.py @@ -17,21 +17,21 @@ import ray.cloudpickle as cloudpickle from ray._private.arrow_utils import _check_pyarrow_version, get_pyarrow_version from ray._private.ray_constants import env_integer -from ray.air.util.object_extensions.arrow import ( +from ray.data._internal.numpy_support import ( + _convert_datetime_to_np_datetime, + convert_to_numpy, +) +from ray.data._internal.object_extensions.arrow import ( MIN_PYARROW_VERSION_SCALAR_SUBCLASS, ArrowPythonObjectArray, _object_extension_type_allowed, ) -from ray.air.util.tensor_extensions.utils import ( +from ray.data._internal.tensor_extensions.utils import ( ArrayLike, _is_ndarray_variable_shaped_tensor, _should_convert_to_tensor, create_ragged_ndarray, ) -from ray.data._internal.numpy_support import ( - _convert_datetime_to_np_datetime, - convert_to_numpy, -) from ray.util import log_once from ray.util.annotations import DeveloperAPI, PublicAPI from ray.util.common import INT32_MAX @@ -197,6 +197,12 @@ def pyarrow_table_from_pydict( """ Convert a Python dictionary to a pyarrow Table. + Args: + pydict: The Python dictionary to convert. 
+ + Returns: + The converted pyarrow Table. + Raises: ArrowConversionError: if the conversion fails. """ @@ -562,7 +568,7 @@ def to_pandas_dtype(self): Returns: An instance of pd.api.extensions.ExtensionDtype. """ - from ray.air.util.tensor_extensions.pandas import TensorDtype + from ray.data._internal.tensor_extensions.pandas import TensorDtype return TensorDtype(self._shape, self.scalar_type.to_pandas_dtype()) @@ -1042,7 +1048,7 @@ def to_pandas_dtype(self): Returns: An instance of pd.api.extensions.ExtensionDtype. """ - from ray.air.util.tensor_extensions.pandas import TensorDtype + from ray.data._internal.tensor_extensions.pandas import TensorDtype return TensorDtype( self.shape, diff --git a/python/ray/air/util/tensor_extensions/pandas.py b/python/ray/data/_internal/tensor_extensions/pandas.py similarity index 98% rename from python/ray/air/util/tensor_extensions/pandas.py rename to python/ray/data/_internal/tensor_extensions/pandas.py index 62245cf05884..07f8fd3c34d0 100644 --- a/python/ray/air/util/tensor_extensions/pandas.py +++ b/python/ray/data/_internal/tensor_extensions/pandas.py @@ -43,7 +43,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.indexers import check_array_indexer, validate_indices -from ray.air.util.tensor_extensions.utils import ( +from ray.data._internal.tensor_extensions.utils import ( _create_possibly_ragged_ndarray, _is_ndarray_variable_shaped_tensor, ) @@ -292,6 +292,13 @@ class TensorDtype(pd.api.extensions.ExtensionDtype): base = None def __init__(self, shape: Tuple[Optional[int], ...], dtype: np.dtype): + """ + Create a new TensorDtype. + + Args: + shape: The shape of the tensor elements. + dtype: The dtype of the tensor elements. + """ self._shape = shape self._dtype = dtype @@ -731,10 +738,14 @@ def __init__( Any, ], ): - """ + """Initialize a TensorArray from a sequence of ndarrays. + Args: - values: A NumPy ndarray or sequence of NumPy ndarrays of equal - shape. 
+ values: (Union[np.ndarray, ABCSeries, Sequence[Union[np.ndarray, TensorArrayElement]], + TensorArrayElement, Any]): A NumPy ndarray or sequence of NumPy ndarrays of equal shape. + + Raises: + TypeError: If values is not a numpy.ndarray or sequence of numpy.ndarray. """ # Try to convert some well-known objects to ndarrays before handing off to # ndarray handling logic. @@ -1018,7 +1029,7 @@ def take( extension array to object dtype. This uses the helper method :func:`pandas.api.extensions.take`. - .. code-block:: python + .. testcode:: def take(self, indices, allow_fill=False, fill_value=None): from pandas.core.algorithms import take @@ -1392,7 +1403,7 @@ def __arrow_array__(self, type=None): https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow for more information. """ - from ray.air.util.tensor_extensions.arrow import ( + from ray.data._internal.tensor_extensions.arrow import ( ArrowTensorArray, ArrowVariableShapedTensorArray, ) diff --git a/python/ray/air/util/tensor_extensions/utils.py b/python/ray/data/_internal/tensor_extensions/utils.py similarity index 99% rename from python/ray/air/util/tensor_extensions/utils.py rename to python/ray/data/_internal/tensor_extensions/utils.py index 6ac182c9d117..b93526607736 100644 --- a/python/ray/air/util/tensor_extensions/utils.py +++ b/python/ray/data/_internal/tensor_extensions/utils.py @@ -10,7 +10,7 @@ import numpy as np -from ray.air.constants import TENSOR_COLUMN_NAME +from ray.data.constants import TENSOR_COLUMN_NAME from ray.util import PublicAPI from ray.util.annotations import DeveloperAPI diff --git a/python/ray/data/_internal/utils/__init__.py b/python/ray/data/_internal/utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/ray/data/_internal/utils/tensorflow_utils.py b/python/ray/data/_internal/utils/tensorflow_utils.py new file mode 100644 index 000000000000..2839f6f4328a --- /dev/null +++ 
b/python/ray/data/_internal/utils/tensorflow_utils.py @@ -0,0 +1,139 @@ +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union + +import numpy as np +import pyarrow +import tensorflow as tf + +from ray.data._internal.tensor_extensions.arrow import get_arrow_extension_tensor_types +from ray.data.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +if TYPE_CHECKING: + from ray.data._internal.pandas_block import PandasBlockSchema + + +def convert_ndarray_to_tf_tensor( + ndarray: np.ndarray, + dtype: Optional[tf.dtypes.DType] = None, + type_spec: Optional[tf.TypeSpec] = None, +) -> tf.Tensor: + """Convert a NumPy ndarray to a TensorFlow Tensor. + + Args: + ndarray: A NumPy ndarray that we wish to convert to a TensorFlow Tensor. + dtype: A TensorFlow dtype for the created tensor; if None, the dtype will be + inferred from the NumPy ndarray data. + type_spec: A type spec that specifies the shape and dtype of the returned + tensor. If you specify ``dtype``, the dtype stored in the type spec is + ignored. + + Returns: + A TensorFlow Tensor. + """ + if dtype is None and type_spec is not None: + dtype = type_spec.dtype + + is_ragged = isinstance(type_spec, tf.RaggedTensorSpec) + ndarray = _unwrap_ndarray_object_type_if_needed(ndarray) + if is_ragged: + return tf.ragged.constant(ndarray, dtype=dtype) + else: + return tf.convert_to_tensor(ndarray, dtype=dtype) + + +def convert_ndarray_batch_to_tf_tensor_batch( + ndarrays: Union[np.ndarray, Dict[str, np.ndarray]], + dtypes: Optional[Union[tf.dtypes.DType, Dict[str, tf.dtypes.DType]]] = None, +) -> Union[tf.Tensor, Dict[str, tf.Tensor]]: + """Convert a NumPy ndarray batch to a TensorFlow Tensor batch. + + Args: + ndarrays: A (dict of) NumPy ndarray(s) that we wish to convert to a TensorFlow + Tensor. + dtypes: A (dict of) TensorFlow dtype(s) for the created tensor; if None, the + dtype will be inferred from the NumPy ndarray data. + + Returns: + A (dict of) TensorFlow Tensor(s). 
+ """ + if isinstance(ndarrays, np.ndarray): + # Single-tensor case. + if isinstance(dtypes, dict): + if len(dtypes) != 1: + raise ValueError( + "When constructing a single-tensor batch, only a single dtype " + f"should be given, instead got: {dtypes}" + ) + dtypes = next(iter(dtypes.values())) + batch = convert_ndarray_to_tf_tensor(ndarrays, dtypes) + else: + # Multi-tensor case. + batch = { + col_name: convert_ndarray_to_tf_tensor( + col_ndarray, + dtype=dtypes[col_name] if isinstance(dtypes, dict) else dtypes, + ) + for col_name, col_ndarray in ndarrays.items() + } + + return batch + + +def get_type_spec( + schema: Union["pyarrow.lib.Schema", "PandasBlockSchema"], + columns: Union[str, List[str]], +) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]: + import pyarrow as pa + + from ray.data.extensions import TensorDtype + + tensor_extension_types = get_arrow_extension_tensor_types() + + assert not isinstance(schema, type) + + dtypes: Dict[str, Union[np.dtype, pa.DataType]] = dict( + zip(schema.names, schema.types) + ) + + def get_dtype(dtype: Union[np.dtype, pa.DataType]) -> tf.dtypes.DType: + if isinstance(dtype, pa.ListType): + dtype = dtype.value_type + if isinstance(dtype, pa.DataType): + dtype = dtype.to_pandas_dtype() + if isinstance(dtype, TensorDtype): + dtype = dtype.element_dtype + res = tf.dtypes.as_dtype(dtype) + return res + + def get_shape(dtype: Union[np.dtype, pa.DataType]) -> Tuple[int, ...]: + shape = (None,) + if isinstance(dtype, tensor_extension_types): + dtype = dtype.to_pandas_dtype() + if isinstance(dtype, pa.ListType): + shape += (None,) + elif isinstance(dtype, TensorDtype): + shape += dtype.element_shape + return shape + + def get_tensor_spec( + dtype: Union[np.dtype, pa.DataType], *, name: str + ) -> tf.TypeSpec: + + shape, dtype = get_shape(dtype), get_dtype(dtype) + # Batch dimension is always `None`. So, if there's more than one `None`-valued + # dimension, then the tensor is ragged. 
+ is_ragged = sum(dim is None for dim in shape) > 1 + if is_ragged: + type_spec = tf.RaggedTensorSpec(shape, dtype=dtype) + else: + type_spec = tf.TensorSpec(shape, dtype=dtype, name=name) + return type_spec + + if isinstance(columns, str): + name, dtype = columns, dtypes[columns] + return get_tensor_spec(dtype, name=name) + + return { + name: get_tensor_spec(dtype, name=name) + for name, dtype in dtypes.items() + if name in columns + } diff --git a/python/ray/air/util/transform_pyarrow.py b/python/ray/data/_internal/utils/transform_pyarrow.py similarity index 88% rename from python/ray/air/util/transform_pyarrow.py rename to python/ray/data/_internal/utils/transform_pyarrow.py index 1617f04a6bfc..63a23b7f7780 100644 --- a/python/ray/air/util/transform_pyarrow.py +++ b/python/ray/data/_internal/utils/transform_pyarrow.py @@ -1,4 +1,4 @@ -from ray.air.util.tensor_extensions.arrow import concat_tensor_arrays +from ray.data._internal.tensor_extensions.arrow import concat_tensor_arrays try: import pyarrow @@ -25,8 +25,11 @@ def _concatenate_extension_column( Args: ca: The chunked array representing the extension column to be concatenated. ensure_copy: Skip copying when ensure_copy is False and there is exactly 1 chunk. + + Returns: + Array: the concatenated extension column.
""" - from ray.air.util.tensor_extensions.arrow import ( + from ray.data._internal.tensor_extensions.arrow import ( get_arrow_extension_tensor_types, ) diff --git a/python/ray/data/block.py b/python/ray/data/block.py index 42c9525b0c4c..4a00b72daac9 100644 --- a/python/ray/data/block.py +++ b/python/ray/data/block.py @@ -470,7 +470,9 @@ def batch_to_block( elif isinstance(batch, collections.abc.Mapping): if block_type is None or block_type == BlockType.ARROW: - from ray.air.util.tensor_extensions.arrow import ArrowConversionError + from ray.data._internal.tensor_extensions.arrow import ( + ArrowConversionError, + ) try: return cls.batch_to_arrow_block(batch) diff --git a/python/ray/data/collate_fn.py b/python/ray/data/collate_fn.py index b4b91102c562..8f3b50ccff43 100644 --- a/python/ray/data/collate_fn.py +++ b/python/ray/data/collate_fn.py @@ -16,7 +16,6 @@ import numpy as np from ray._private.ray_constants import env_integer -from ray.data.block import DataBatch from ray.util.annotations import DeveloperAPI if TYPE_CHECKING: @@ -277,7 +276,7 @@ def __call__( Returns: Dictionary mapping column names to lists of tensors """ - from ray.air._internal.torch_utils import ( + from ray.data.util.torch_utils import ( arrow_batch_to_tensors, ) diff --git a/python/ray/data/constants.py b/python/ray/data/constants.py new file mode 100644 index 000000000000..c58662a1c786 --- /dev/null +++ b/python/ray/data/constants.py @@ -0,0 +1,6 @@ +# Name to use for the column when representing tensors in table format. +TENSOR_COLUMN_NAME = "__value__" + +# The maximum length of strings returned by `__repr__` for AIR objects constructed with +# default values. 
+MAX_REPR_LENGTH = int(80 * 1.5) diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py index 99bdc0a2c818..fe5e7ae803a7 100644 --- a/python/ray/data/dataset.py +++ b/python/ray/data/dataset.py @@ -28,10 +28,6 @@ import ray.cloudpickle as pickle from ray._common.usage import usage_lib from ray._private.thirdparty.tabulate.tabulate import tabulate -from ray.air.util.tensor_extensions.arrow import ( - ArrowTensorTypeV2, - get_arrow_extension_fixed_shape_tensor_types, -) from ray.data._internal.compute import ComputeStrategy from ray.data._internal.datasource.bigquery_datasink import BigQueryDatasink from ray.data._internal.datasource.clickhouse_datasink import ( @@ -89,6 +85,10 @@ from ray.data._internal.remote_fn import cached_remote_fn from ray.data._internal.split import _get_num_rows, _split_at_indices from ray.data._internal.stats import DatasetStats, DatasetStatsSummary, _StatsManager +from ray.data._internal.tensor_extensions.arrow import ( + ArrowTensorTypeV2, + get_arrow_extension_fixed_shape_tensor_types, +) from ray.data._internal.util import ( AllToAllAPI, ConsumptionAPI, diff --git a/python/ray/data/datatype.py b/python/ray/data/datatype.py index d22faf4d456f..91f6294c4b97 100644 --- a/python/ray/data/datatype.py +++ b/python/ray/data/datatype.py @@ -5,7 +5,7 @@ import numpy as np import pyarrow as pa -from ray.air.util.tensor_extensions.arrow import ( +from ray.data._internal.tensor_extensions.arrow import ( _infer_pyarrow_type, ) from ray.util.annotations import PublicAPI @@ -475,7 +475,7 @@ def tensor( >>> DataType.tensor(shape=(3, 4), dtype=DataType.float32()) # doctest: +ELLIPSIS DataType(arrow:ArrowTensorType(...)) """ - from ray.air.util.tensor_extensions.arrow import ArrowTensorType + from ray.data._internal.tensor_extensions.arrow import ArrowTensorType element_arrow_type = dtype.to_arrow_dtype() return cls.from_arrow(ArrowTensorType(shape, element_arrow_type)) @@ -500,7 +500,9 @@ def variable_shaped_tensor( >>> 
DataType.variable_shaped_tensor(dtype=DataType.float32(), ndim=2) # doctest: +ELLIPSIS DataType(arrow:ArrowVariableShapedTensorType(...)) """ - from ray.air.util.tensor_extensions.arrow import ArrowVariableShapedTensorType + from ray.data._internal.tensor_extensions.arrow import ( + ArrowVariableShapedTensorType, + ) element_arrow_type = dtype.to_arrow_dtype() return cls.from_arrow(ArrowVariableShapedTensorType(element_arrow_type, ndim)) @@ -610,7 +612,7 @@ def is_tensor_type(self) -> bool: if not self.is_arrow_type(): return False - from ray.air.util.tensor_extensions.arrow import ( + from ray.data._internal.tensor_extensions.arrow import ( get_arrow_extension_tensor_types, ) diff --git a/python/ray/data/extensions/__init__.py b/python/ray/data/extensions/__init__.py index 517b4fe7a3a2..70a9640ed323 100644 --- a/python/ray/data/extensions/__init__.py +++ b/python/ray/data/extensions/__init__.py @@ -1,4 +1,4 @@ -from ray.air.util.tensor_extensions.arrow import ( +from ray.data._internal.tensor_extensions.arrow import ( ArrowTensorTypeV2, get_arrow_extension_tensor_types, ) diff --git a/python/ray/data/extensions/object_extension.py b/python/ray/data/extensions/object_extension.py index 42ab20a231c6..50a1389170bc 100644 --- a/python/ray/data/extensions/object_extension.py +++ b/python/ray/data/extensions/object_extension.py @@ -1,10 +1,10 @@ -from ray.air.util.object_extensions.arrow import ( # noqa: F401 +from ray.data._internal.object_extensions.arrow import ( # noqa: F401 ArrowPythonObjectArray, ArrowPythonObjectScalar, ArrowPythonObjectType, _object_extension_type_allowed, ) -from ray.air.util.object_extensions.pandas import ( # noqa: F401 +from ray.data._internal.object_extensions.pandas import ( # noqa: F401 PythonObjectArray, PythonObjectDtype, ) diff --git a/python/ray/data/extensions/tensor_extension.py b/python/ray/data/extensions/tensor_extension.py index 121685e4c5ad..892591d69fcb 100644 --- a/python/ray/data/extensions/tensor_extension.py +++ 
b/python/ray/data/extensions/tensor_extension.py @@ -1,4 +1,4 @@ -from ray.air.util.tensor_extensions.arrow import ( # noqa: F401 +from ray.data._internal.tensor_extensions.arrow import ( # noqa: F401 ArrowConversionError, ArrowTensorArray, ArrowTensorType, @@ -6,10 +6,12 @@ ArrowVariableShapedTensorArray, ArrowVariableShapedTensorType, ) -from ray.air.util.tensor_extensions.pandas import ( # noqa: F401 +from ray.data._internal.tensor_extensions.pandas import ( # noqa: F401 TensorArray, TensorArrayElement, TensorDtype, column_needs_tensor_extension, ) -from ray.air.util.tensor_extensions.utils import create_ragged_ndarray # noqa: F401 +from ray.data._internal.tensor_extensions.utils import ( + create_ragged_ndarray, # noqa: F401 +) diff --git a/python/ray/data/iterator.py b/python/ray/data/iterator.py index 3a0339807469..ee91e4fba62e 100644 --- a/python/ray/data/iterator.py +++ b/python/ray/data/iterator.py @@ -430,7 +430,7 @@ def iter_torch_batches( # Ray Train is not being used. device = get_device() if _in_ray_train_worker() else "cpu" - from ray.air._internal.torch_utils import ( + from ray.data.util.torch_utils import ( move_tensors_to_device, ) @@ -559,7 +559,7 @@ def iter_tf_batches( Returns: An iterator over TensorFlow Tensor batches. """ - from ray.air._internal.tensorflow_utils import ( + from ray.data._internal.utils.tensorflow_utils import ( convert_ndarray_batch_to_tf_tensor_batch, ) @@ -689,8 +689,8 @@ def to_torch( """ import torch - from ray.air._internal.torch_utils import convert_pandas_to_torch_tensor from ray.data._internal.torch_iterable_dataset import TorchIterableDataset + from ray.data.util.torch_utils import convert_pandas_to_torch_tensor # If an empty collection is passed in, treat it the same as None if not feature_columns: @@ -904,7 +904,7 @@ def to_tf( A ``tf.data.Dataset`` that yields inputs and targets. 
""" # noqa: E501 - from ray.air._internal.tensorflow_utils import ( + from ray.data._internal.utils.tensorflow_utils import ( convert_ndarray_to_tf_tensor, get_type_spec, ) diff --git a/python/ray/data/preprocessor.py b/python/ray/data/preprocessor.py index ca4a181c7b35..5952d02b403f 100644 --- a/python/ray/data/preprocessor.py +++ b/python/ray/data/preprocessor.py @@ -7,7 +7,7 @@ from enum import Enum from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, final -from ray.air.util.data_batch_conversion import BatchFormat +from ray.data.util.data_batch_conversion import BatchFormat from ray.util.annotations import DeveloperAPI, PublicAPI if TYPE_CHECKING: @@ -319,7 +319,7 @@ def _transform_batch(self, data: "DataBatchType") -> "DataBatchType": import numpy as np import pandas as pd - from ray.air.util.data_batch_conversion import ( + from ray.data.util.data_batch_conversion import ( _convert_batch_type_to_numpy, _convert_batch_type_to_pandas, ) diff --git a/python/ray/data/preprocessors/chain.py b/python/ray/data/preprocessors/chain.py index bfe53ca06ac3..3ac6a01bda9f 100644 --- a/python/ray/data/preprocessors/chain.py +++ b/python/ray/data/preprocessors/chain.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING, Optional -from ray.air.util.data_batch_conversion import BatchFormat from ray.data.preprocessor import Preprocessor +from ray.data.util.data_batch_conversion import BatchFormat if TYPE_CHECKING: from ray.air.data_batch_type import DataBatchType diff --git a/python/ray/data/preprocessors/encoder.py b/python/ray/data/preprocessors/encoder.py index 4686affeb346..fbd1e549e2d2 100644 --- a/python/ray/data/preprocessors/encoder.py +++ b/python/ray/data/preprocessors/encoder.py @@ -7,7 +7,6 @@ import pandas as pd import pandas.api.types -from ray.air.util.data_batch_conversion import BatchFormat from ray.data._internal.util import is_null from ray.data.preprocessor import ( Preprocessor, @@ -16,6 +15,7 @@ ) from ray.data.preprocessors.utils import 
make_post_processor from ray.data.preprocessors.version_support import SerializablePreprocessor +from ray.data.util.data_batch_conversion import BatchFormat from ray.util.annotations import PublicAPI if TYPE_CHECKING: diff --git a/python/ray/data/preprocessors/torch.py b/python/ray/data/preprocessors/torch.py index 6f1323cef07e..88415d09b83a 100644 --- a/python/ray/data/preprocessors/torch.py +++ b/python/ray/data/preprocessors/torch.py @@ -2,9 +2,9 @@ import numpy as np -from ray.air.util.data_batch_conversion import BatchFormat -from ray.air.util.tensor_extensions.utils import _create_possibly_ragged_ndarray +from ray.data._internal.tensor_extensions.utils import _create_possibly_ragged_ndarray from ray.data.preprocessor import Preprocessor +from ray.data.util.data_batch_conversion import BatchFormat from ray.util.annotations import PublicAPI if TYPE_CHECKING: @@ -110,7 +110,7 @@ def _transform_numpy( ) -> Dict[str, "np.ndarray"]: import torch - from ray.air._internal.torch_utils import convert_ndarray_to_torch_tensor + from ray.data.util.torch_utils import convert_ndarray_to_torch_tensor def apply_torchvision_transform(array: np.ndarray) -> np.ndarray: try: diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index 73876485e98a..d3af2e802436 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -22,7 +22,6 @@ import ray from ray._private.arrow_utils import get_pyarrow_version from ray._private.auto_init_hook import wrap_auto_init -from ray.air.util.tensor_extensions.utils import _create_possibly_ragged_ndarray from ray.data._internal.compute import TaskPoolStrategy from ray.data._internal.datasource.audio_datasource import AudioDatasource from ray.data._internal.datasource.avro_datasource import AvroDatasource @@ -73,6 +72,7 @@ from ray.data._internal.plan import ExecutionPlan from ray.data._internal.remote_fn import cached_remote_fn from ray.data._internal.stats import DatasetStats +from 
ray.data._internal.tensor_extensions.utils import _create_possibly_ragged_ndarray from ray.data._internal.util import ( _autodetect_parallelism, get_table_block_metadata_schema, @@ -3106,7 +3106,7 @@ def from_pandas( ary = dfs[0] dfs = np.array_split(ary, override_num_blocks) - from ray.air.util.data_batch_conversion import ( + from ray.data.util.data_batch_conversion import ( _cast_ndarray_columns_to_tensor_extension, ) diff --git a/python/ray/data/stats.py b/python/ray/data/stats.py index d8fadbe644e6..bc2e9e497c40 100644 --- a/python/ray/data/stats.py +++ b/python/ray/data/stats.py @@ -5,7 +5,7 @@ import pandas as pd import pyarrow as pa -from ray.air.util.tensor_extensions.arrow import convert_to_pyarrow_array +from ray.data._internal.tensor_extensions.arrow import convert_to_pyarrow_array from ray.data.aggregate import ( AggregateFnV2, ApproximateQuantile, diff --git a/python/ray/data/tests/conftest.py b/python/ray/data/tests/conftest.py index a03ced9b2217..29ede5d1037d 100644 --- a/python/ray/data/tests/conftest.py +++ b/python/ray/data/tests/conftest.py @@ -13,9 +13,9 @@ from ray._common.test_utils import wait_for_condition from ray._private.arrow_utils import get_pyarrow_version from ray._private.internal_api import get_memory_info_reply, get_state_from_address -from ray.air.constants import TENSOR_COLUMN_NAME -from ray.air.util.tensor_extensions.arrow import ArrowTensorArray +from ray.data._internal.tensor_extensions.arrow import ArrowTensorArray from ray.data.block import BlockExecStats, BlockMetadata +from ray.data.constants import TENSOR_COLUMN_NAME from ray.data.context import DEFAULT_TARGET_MAX_BLOCK_SIZE, DataContext, ShuffleStrategy from ray.data.tests.mock_server import * # noqa diff --git a/python/ray/data/tests/preprocessors/test_chain.py b/python/ray/data/tests/preprocessors/test_chain.py index 85961d0b7c8f..c7011267a526 100644 --- a/python/ray/data/tests/preprocessors/test_chain.py +++ b/python/ray/data/tests/preprocessors/test_chain.py @@ -2,9 
+2,9 @@ import pytest import ray -from ray.air.util.data_batch_conversion import BatchFormat from ray.data.preprocessor import Preprocessor from ray.data.preprocessors import Chain, LabelEncoder, SimpleImputer, StandardScaler +from ray.data.util.data_batch_conversion import BatchFormat def test_chain(): diff --git a/python/ray/data/tests/preprocessors/test_preprocessors.py b/python/ray/data/tests/preprocessors/test_preprocessors.py index 7b3fa84ca95e..82729923fd01 100644 --- a/python/ray/data/tests/preprocessors/test_preprocessors.py +++ b/python/ray/data/tests/preprocessors/test_preprocessors.py @@ -9,8 +9,7 @@ import pytest import ray -from ray.air.constants import MAX_REPR_LENGTH -from ray.air.util.data_batch_conversion import BatchFormat +from ray.data.constants import MAX_REPR_LENGTH from ray.data.preprocessor import Preprocessor from ray.data.preprocessors import ( Categorizer, @@ -33,6 +32,7 @@ Tokenizer, TorchVisionPreprocessor, ) +from ray.data.util.data_batch_conversion import BatchFormat @pytest.fixture diff --git a/python/ray/data/tests/test_arrow_block.py b/python/ray/data/tests/test_arrow_block.py index 6f1f51f14fb0..4bb33a59a911 100644 --- a/python/ray/data/tests/test_arrow_block.py +++ b/python/ray/data/tests/test_arrow_block.py @@ -13,9 +13,6 @@ import ray from ray._private.test_utils import run_string_as_driver -from ray.air.util.tensor_extensions.arrow import ( - ArrowTensorArray, -) from ray.data._internal.arrow_block import ( ArrowBlockAccessor, ArrowBlockBuilder, @@ -23,6 +20,9 @@ _get_max_chunk_size, ) from ray.data._internal.arrow_ops.transform_pyarrow import combine_chunked_array +from ray.data._internal.tensor_extensions.arrow import ( + ArrowTensorArray, +) from ray.data._internal.util import GiB, MiB from ray.data.block import BlockAccessor from ray.data.context import DataContext diff --git a/python/ray/data/tests/test_daft.py b/python/ray/data/tests/test_daft.py index 435b44d48e5e..126445562d3d 100644 --- 
a/python/ray/data/tests/test_daft.py +++ b/python/ray/data/tests/test_daft.py @@ -20,8 +20,8 @@ def ray_start(request): os.environ["RAY_DATA_ARROW_EXTENSION_SERIALIZATION_LEGACY_JSON_FORMAT"] = "1" import ray - import ray.air.util.tensor_extensions.arrow as arrow_module - from ray.air.util.tensor_extensions.arrow import _SerializationFormat + import ray.data._internal.tensor_extensions.arrow as arrow_module + from ray.data._internal.tensor_extensions.arrow import _SerializationFormat # Force the serialization format to JSON after import arrow_module.ARROW_EXTENSION_SERIALIZATION_FORMAT = _SerializationFormat.JSON diff --git a/python/ray/data/tests/test_ecosystem_dask.py b/python/ray/data/tests/test_ecosystem_dask.py index 755f930682d8..f59e1d555bbf 100644 --- a/python/ray/data/tests/test_ecosystem_dask.py +++ b/python/ray/data/tests/test_ecosystem_dask.py @@ -6,7 +6,7 @@ import pytest import ray -from ray.air.util.tensor_extensions.arrow import ( +from ray.data._internal.tensor_extensions.arrow import ( get_arrow_extension_fixed_shape_tensor_types, ) from ray.data.extensions.tensor_extension import ( diff --git a/python/ray/data/tests/test_image.py b/python/ray/data/tests/test_image.py index 535c3d6ab64a..afb0ae70ef0b 100644 --- a/python/ray/data/tests/test_image.py +++ b/python/ray/data/tests/test_image.py @@ -8,13 +8,13 @@ from PIL import Image import ray -from ray.air.util.tensor_extensions.arrow import ( - get_arrow_extension_fixed_shape_tensor_types, -) from ray.data._internal.datasource.image_datasource import ( ImageDatasource, ImageFileMetadataProvider, ) +from ray.data._internal.tensor_extensions.arrow import ( + get_arrow_extension_fixed_shape_tensor_types, +) from ray.data.datasource.file_meta_provider import FastFileMetadataProvider from ray.data.tests.conftest import * # noqa from ray.data.tests.mock_http_server import * # noqa diff --git a/python/ray/data/tests/test_numpy.py b/python/ray/data/tests/test_numpy.py index e95a81b863f6..25a0662a2e4f 100644 
--- a/python/ray/data/tests/test_numpy.py +++ b/python/ray/data/tests/test_numpy.py @@ -6,7 +6,7 @@ import pytest import ray -from ray.air.util.tensor_extensions.arrow import ArrowTensorTypeV2 +from ray.data._internal.tensor_extensions.arrow import ArrowTensorTypeV2 from ray.data.context import DataContext from ray.data.dataset import Schema from ray.data.datasource import ( diff --git a/python/ray/data/tests/test_numpy_support.py b/python/ray/data/tests/test_numpy_support.py index 49731d612a2c..4f1332223faf 100644 --- a/python/ray/data/tests/test_numpy_support.py +++ b/python/ray/data/tests/test_numpy_support.py @@ -6,7 +6,7 @@ import torch import ray -from ray.air.util.tensor_extensions.utils import create_ragged_ndarray +from ray.data._internal.tensor_extensions.utils import create_ragged_ndarray from ray.data.context import DataContext from ray.data.tests.conftest import * # noqa from ray.tests.conftest import * # noqa diff --git a/python/ray/data/tests/test_pandas.py b/python/ray/data/tests/test_pandas.py index 43e75ab45f76..bdfa4d33f886 100644 --- a/python/ray/data/tests/test_pandas.py +++ b/python/ray/data/tests/test_pandas.py @@ -6,12 +6,13 @@ import pytest import ray -from ray.air.util.tensor_extensions.arrow import ( +from ray.data._internal.execution.interfaces.ref_bundle import RefBundle +from ray.data._internal.tensor_extensions.arrow import ( + ArrowTensorArray, get_arrow_extension_fixed_shape_tensor_types, ) -from ray.data._internal.execution.interfaces.ref_bundle import RefBundle from ray.data.block import Block -from ray.data.extensions import ArrowTensorArray, TensorDtype +from ray.data.extensions import TensorDtype from ray.data.tests.conftest import * # noqa from ray.data.tests.mock_http_server import * # noqa from ray.tests.conftest import * # noqa diff --git a/python/ray/data/tests/test_parquet.py b/python/ray/data/tests/test_parquet.py index c9ee45e2b2c2..962c0f2960a8 100644 --- a/python/ray/data/tests/test_parquet.py +++ 
b/python/ray/data/tests/test_parquet.py @@ -18,10 +18,6 @@ import ray from ray._private.arrow_utils import get_pyarrow_version -from ray.air.util.tensor_extensions.arrow import ( - ArrowTensorTypeV2, - get_arrow_extension_fixed_shape_tensor_types, -) from ray.data import FileShuffleConfig, Schema from ray.data._internal.datasource.parquet_datasource import ( ParquetDatasource, @@ -29,6 +25,10 @@ from ray.data._internal.execution.interfaces.ref_bundle import ( _ref_bundles_iterator_to_block_refs_list, ) +from ray.data._internal.tensor_extensions.arrow import ( + ArrowTensorTypeV2, + get_arrow_extension_fixed_shape_tensor_types, +) from ray.data._internal.util import rows_same from ray.data.block import BlockAccessor from ray.data.context import DataContext diff --git a/python/ray/data/tests/test_strict_mode.py b/python/ray/data/tests/test_strict_mode.py index 70bed964e5f6..cc84d2bf5239 100644 --- a/python/ray/data/tests/test_strict_mode.py +++ b/python/ray/data/tests/test_strict_mode.py @@ -6,7 +6,7 @@ import pytest import ray -from ray.air.util.tensor_extensions.pandas import TensorDtype +from ray.data._internal.tensor_extensions.pandas import TensorDtype from ray.data.context import DataContext from ray.data.dataset import Schema from ray.data.tests.conftest import * # noqa @@ -230,7 +230,7 @@ def test_strict_schema(ray_start_regular_shared_2_cpus): assert isinstance(schema.base_schema, pa.lib.Schema) assert schema.names == ["data"] - from ray.air.util.tensor_extensions.arrow import ArrowTensorTypeV2 + from ray.data._internal.tensor_extensions.arrow import ArrowTensorTypeV2 from ray.data.context import DataContext if DataContext.get_current().use_arrow_tensor_v2: diff --git a/python/ray/data/tests/test_tensor.py b/python/ray/data/tests/test_tensor.py index 11abcd02624e..37a2cda8e6a7 100644 --- a/python/ray/data/tests/test_tensor.py +++ b/python/ray/data/tests/test_tensor.py @@ -7,12 +7,12 @@ import pytest import ray -from ray.air.util.tensor_extensions.utils 
import _create_possibly_ragged_ndarray +from ray.data._internal.tensor_extensions.arrow import ArrowTensorArray +from ray.data._internal.tensor_extensions.utils import _create_possibly_ragged_ndarray from ray.data.block import BlockAccessor from ray.data.context import DataContext from ray.data.dataset import Schema from ray.data.extensions.tensor_extension import ( - ArrowTensorArray, ArrowTensorType, ArrowTensorTypeV2, ArrowVariableShapedTensorArray, diff --git a/python/ray/data/tests/test_tensor_extension.py b/python/ray/data/tests/test_tensor_extension.py index 1872f42d1c8b..1c14276c426b 100644 --- a/python/ray/data/tests/test_tensor_extension.py +++ b/python/ray/data/tests/test_tensor_extension.py @@ -9,7 +9,8 @@ from packaging.version import parse as parse_version from ray._private.arrow_utils import get_pyarrow_version -from ray.air.util.tensor_extensions.arrow import ( +from ray.data import DataContext +from ray.data._internal.tensor_extensions.arrow import ( ArrowConversionError, ArrowTensorArray, ArrowTensorType, @@ -22,11 +23,10 @@ concat_tensor_arrays, unify_tensor_arrays, ) -from ray.air.util.tensor_extensions.pandas import TensorArray, TensorDtype -from ray.air.util.tensor_extensions.utils import ( +from ray.data._internal.tensor_extensions.pandas import TensorArray, TensorDtype +from ray.data._internal.tensor_extensions.utils import ( create_ragged_ndarray, ) -from ray.data import DataContext @pytest.mark.parametrize("tensor_format", ["v1", "v2"]) diff --git a/python/ray/data/tests/test_torch_tensor_utils.py b/python/ray/data/tests/test_torch_tensor_utils.py index 16e59ed1c0b0..d0be8138628b 100644 --- a/python/ray/data/tests/test_torch_tensor_utils.py +++ b/python/ray/data/tests/test_torch_tensor_utils.py @@ -3,7 +3,7 @@ import pytest import torch -from ray.air._internal.torch_utils import ( +from ray.data.util.torch_utils import ( concat_tensors_to_device, move_tensors_to_device, ) diff --git a/python/ray/data/tests/test_transform_pyarrow.py 
b/python/ray/data/tests/test_transform_pyarrow.py index 1516e9015d82..8f0db556dd90 100644 --- a/python/ray/data/tests/test_transform_pyarrow.py +++ b/python/ray/data/tests/test_transform_pyarrow.py @@ -10,10 +10,6 @@ import ray from ray._private.arrow_utils import get_pyarrow_version -from ray.air.util.tensor_extensions.arrow import ( - ArrowTensorTypeV2, - _extension_array_concat_supported, -) from ray.data._internal.arrow_ops.transform_pyarrow import ( MIN_PYARROW_VERSION_TYPE_PROMOTION, _align_struct_fields, @@ -23,6 +19,10 @@ try_combine_chunked_columns, unify_schemas, ) +from ray.data._internal.tensor_extensions.arrow import ( + ArrowTensorTypeV2, + _extension_array_concat_supported, +) from ray.data.block import BlockAccessor from ray.data.context import DataContext from ray.data.extensions import ( @@ -2712,7 +2712,7 @@ def struct_variable_shaped_tensor_expected(): @pytest.fixture def unify_schemas_object_types_schemas(): """Fixture for object types unify schemas test data.""" - from ray.air.util.object_extensions.arrow import ArrowPythonObjectType + from ray.data._internal.object_extensions.arrow import ArrowPythonObjectType schema1 = pa.schema([("obj_col", ArrowPythonObjectType())]) schema2 = pa.schema([("obj_col", pa.int32())]) @@ -2738,7 +2738,7 @@ def unify_schemas_incompatible_tensor_schemas(): @pytest.fixture def unify_schemas_objects_and_tensors_schemas(): """Fixture for objects and tensors unify schemas test data.""" - from ray.air.util.object_extensions.arrow import ArrowPythonObjectType + from ray.data._internal.object_extensions.arrow import ArrowPythonObjectType schema1 = pa.schema([("col", ArrowPythonObjectType())]) schema2 = pa.schema([("col", ArrowTensorType((2, 2), pa.int32()))]) diff --git a/python/ray/data/tests/unit/test_arrow_type_conversion.py b/python/ray/data/tests/unit/test_arrow_type_conversion.py index 14c5d1f5d1a9..9a8fc4fe940e 100644 --- a/python/ray/data/tests/unit/test_arrow_type_conversion.py +++ 
b/python/ray/data/tests/unit/test_arrow_type_conversion.py @@ -7,16 +7,16 @@ from packaging.version import parse as parse_version from ray._private.arrow_utils import get_pyarrow_version -from ray.air.util.tensor_extensions.arrow import ( +from ray.data import DataContext +from ray.data._internal.execution.util import memory_string +from ray.data._internal.tensor_extensions.arrow import ( ArrowConversionError, ArrowTensorArray, _convert_to_pyarrow_native_array, _infer_pyarrow_type, convert_to_pyarrow_array, ) -from ray.air.util.tensor_extensions.utils import create_ragged_ndarray -from ray.data import DataContext -from ray.data._internal.execution.util import memory_string +from ray.data._internal.tensor_extensions.utils import create_ragged_ndarray from ray.data._internal.util import MiB from ray.tests.conftest import * # noqa diff --git a/python/ray/data/tests/unit/test_data_batch_conversion.py b/python/ray/data/tests/unit/test_data_batch_conversion.py index de7315aabe8c..c0be70b9ba86 100644 --- a/python/ray/data/tests/unit/test_data_batch_conversion.py +++ b/python/ray/data/tests/unit/test_data_batch_conversion.py @@ -5,9 +5,10 @@ import pyarrow as pa import pytest -from ray.air._internal.torch_utils import convert_ndarray_to_torch_tensor -from ray.air.constants import TENSOR_COLUMN_NAME -from ray.air.util.data_batch_conversion import ( +from ray.data._internal.tensor_extensions.arrow import ArrowTensorArray +from ray.data._internal.tensor_extensions.pandas import TensorArray +from ray.data.constants import TENSOR_COLUMN_NAME +from ray.data.util.data_batch_conversion import ( BatchFormat, _cast_ndarray_columns_to_tensor_extension, _cast_tensor_columns_to_ndarrays, @@ -15,8 +16,7 @@ _convert_batch_type_to_pandas, _convert_pandas_to_batch_type, ) -from ray.air.util.tensor_extensions.arrow import ArrowTensorArray -from ray.air.util.tensor_extensions.pandas import TensorArray +from ray.data.util.torch_utils import convert_ndarray_to_torch_tensor def 
test_pandas_pandas(): diff --git a/python/ray/data/tests/unit/test_object_extension.py b/python/ray/data/tests/unit/test_object_extension.py index b95f4c44a958..381a212688ac 100644 --- a/python/ray/data/tests/unit/test_object_extension.py +++ b/python/ray/data/tests/unit/test_object_extension.py @@ -5,12 +5,12 @@ import pyarrow as pa import pytest -from ray.air.util.object_extensions.arrow import ( +from ray.data._internal.object_extensions.arrow import ( ArrowPythonObjectArray, ArrowPythonObjectType, _object_extension_type_allowed, ) -from ray.air.util.object_extensions.pandas import PythonObjectArray +from ray.data._internal.object_extensions.pandas import PythonObjectArray @pytest.mark.skipif( diff --git a/python/ray/data/util/__init__.py b/python/ray/data/util/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/ray/data/util/data_batch_conversion.py b/python/ray/data/util/data_batch_conversion.py new file mode 100644 index 000000000000..dc770d2145dd --- /dev/null +++ b/python/ray/data/util/data_batch_conversion.py @@ -0,0 +1,353 @@ +import warnings +from enum import Enum +from typing import TYPE_CHECKING, Dict, List, Union + +import numpy as np + +from ray.air.data_batch_type import DataBatchType +from ray.data.constants import TENSOR_COLUMN_NAME +from ray.util.annotations import Deprecated, DeveloperAPI + +if TYPE_CHECKING: + import pandas as pd + +# TODO: Consolidate data conversion edges for arrow bug workaround. +try: + import pyarrow +except ImportError: + pyarrow = None + +# Lazy import to avoid ray init failures without pandas installed and allow +# dataset to import modules in this file. 
+_pandas = None + + +def _lazy_import_pandas(): + global _pandas + if _pandas is None: + import pandas + + _pandas = pandas + return _pandas + + +@DeveloperAPI +class BatchFormat(str, Enum): + PANDAS = "pandas" + # TODO: Remove once Arrow is deprecated as user facing batch format + ARROW = "arrow" + NUMPY = "numpy" # Either a single numpy array or a Dict of numpy arrays. + + +@DeveloperAPI +class BlockFormat(str, Enum): + """Internal Dataset block format enum.""" + + PANDAS = "pandas" + ARROW = "arrow" + SIMPLE = "simple" + + +def _convert_batch_type_to_pandas( + data: DataBatchType, + cast_tensor_columns: bool = False, +) -> "pd.DataFrame": + """Convert the provided data to a Pandas DataFrame. + + Args: + data: Data of type DataBatchType + cast_tensor_columns: Whether tensor columns should be cast to NumPy ndarrays. + + Returns: + A pandas Dataframe representation of the input data. + + """ + pd = _lazy_import_pandas() + + if isinstance(data, np.ndarray): + data = pd.DataFrame({TENSOR_COLUMN_NAME: _ndarray_to_column(data)}) + elif isinstance(data, dict): + tensor_dict = {} + for col_name, col in data.items(): + if not isinstance(col, np.ndarray): + raise ValueError( + "All values in the provided dict must be of type " + f"np.ndarray. Found type {type(col)} for key {col_name} " + f"instead." + ) + tensor_dict[col_name] = _ndarray_to_column(col) + data = pd.DataFrame(tensor_dict) + elif pyarrow is not None and isinstance(data, pyarrow.Table): + data = data.to_pandas() + elif not isinstance(data, pd.DataFrame): + raise ValueError( + f"Received data of type: {type(data)}, but expected it to be one " + f"of {DataBatchType}" + ) + if cast_tensor_columns: + data = _cast_tensor_columns_to_ndarrays(data) + return data + + +def _convert_pandas_to_batch_type( + data: "pd.DataFrame", + type: BatchFormat, + cast_tensor_columns: bool = False, +) -> DataBatchType: + """Convert the provided Pandas dataframe to the provided ``type``. 
+ + Args: + data: A Pandas DataFrame + type: The specific ``BatchFormat`` to convert to. + cast_tensor_columns: Whether tensor columns should be cast to our tensor + extension type. + + Returns: + The input data represented with the provided type. + """ + if cast_tensor_columns: + data = _cast_ndarray_columns_to_tensor_extension(data) + if type == BatchFormat.PANDAS: + return data + + elif type == BatchFormat.NUMPY: + if len(data.columns) == 1: + # If just a single column, return as a single numpy array. + return data.iloc[:, 0].to_numpy() + else: + # Else return as a dict of numpy arrays. + output_dict = {} + for column in data: + output_dict[column] = data[column].to_numpy() + return output_dict + + elif type == BatchFormat.ARROW: + if not pyarrow: + raise ValueError( + "Attempted to convert data to Pyarrow Table but Pyarrow " + "is not installed. Please do `pip install pyarrow` to " + "install Pyarrow." + ) + return pyarrow.Table.from_pandas(data) + + else: + raise ValueError( + f"Received type {type}, but expected it to be one of {DataBatchType}" + ) + + +@Deprecated +def convert_batch_type_to_pandas( + data: DataBatchType, + cast_tensor_columns: bool = False, +): + """Convert the provided data to a Pandas DataFrame. + + This API is deprecated from Ray 2.4. + + Args: + data: Data of type DataBatchType + cast_tensor_columns: Whether tensor columns should be cast to NumPy ndarrays. + + Returns: + A pandas Dataframe representation of the input data. + + """ + warnings.warn( + "`convert_batch_type_to_pandas` is deprecated as a developer API " + "starting from Ray 2.4. 
All batch format conversions should be " + "done manually instead of relying on this API.", + PendingDeprecationWarning, + ) + return _convert_batch_type_to_pandas( + data=data, cast_tensor_columns=cast_tensor_columns + ) + + +@Deprecated +def convert_pandas_to_batch_type( + data: "pd.DataFrame", + type: BatchFormat, + cast_tensor_columns: bool = False, +): + """Convert the provided Pandas dataframe to the provided ``type``. + + Args: + data: A Pandas DataFrame + type: The specific ``BatchFormat`` to convert to. + cast_tensor_columns: Whether tensor columns should be cast to our tensor + extension type. + + Returns: + The input data represented with the provided type. + """ + warnings.warn( + "`convert_pandas_to_batch_type` is deprecated as a developer API " + "starting from Ray 2.4. All batch format conversions should be " + "done manually instead of relying on this API.", + PendingDeprecationWarning, + ) + return _convert_pandas_to_batch_type( + data=data, type=type, cast_tensor_columns=cast_tensor_columns + ) + + +def _convert_batch_type_to_numpy( + data: DataBatchType, +) -> Union[np.ndarray, Dict[str, np.ndarray]]: + """Convert the provided data to a NumPy ndarray or dict of ndarrays. + + Args: + data: Data of type DataBatchType + + Returns: + A numpy representation of the input data. + """ + pd = _lazy_import_pandas() + + if isinstance(data, np.ndarray): + return data + elif isinstance(data, dict): + for col_name, col in data.items(): + if not isinstance(col, np.ndarray): + raise ValueError( + "All values in the provided dict must be of type " + f"np.ndarray. Found type {type(col)} for key {col_name} " + f"instead." 
+ ) + return data + elif pyarrow is not None and isinstance(data, pyarrow.Table): + from ray.data._internal.arrow_ops import transform_pyarrow + from ray.data._internal.tensor_extensions.arrow import ( + get_arrow_extension_fixed_shape_tensor_types, + ) + + column_values_ndarrays = [] + + for col in data.columns: + # Combine columnar values arrays to make these contiguous + # (making them compatible with numpy format) + combined_array = transform_pyarrow.combine_chunked_array(col) + + column_values_ndarrays.append( + transform_pyarrow.to_numpy(combined_array, zero_copy_only=False) + ) + + arrow_fixed_shape_tensor_types = get_arrow_extension_fixed_shape_tensor_types() + + # NOTE: This branch is here for backwards-compatibility + if data.column_names == [TENSOR_COLUMN_NAME] and ( + isinstance(data.schema.types[0], arrow_fixed_shape_tensor_types) + ): + return column_values_ndarrays[0] + + return dict(zip(data.column_names, column_values_ndarrays)) + elif isinstance(data, pd.DataFrame): + return _convert_pandas_to_batch_type(data, BatchFormat.NUMPY) + else: + raise ValueError( + f"Received data of type: {type(data)}, but expected it to be one " + f"of {DataBatchType}" + ) + + +def _ndarray_to_column(arr: np.ndarray) -> Union["pd.Series", List[np.ndarray]]: + """Convert a NumPy ndarray into an appropriate column format for insertion into a + pandas DataFrame. + + If conversion to a pandas Series fails (e.g. if the ndarray is multi-dimensional), + fall back to a list of NumPy ndarrays. + """ + pd = _lazy_import_pandas() + try: + # Try to convert to Series, falling back to a list conversion if this fails + # (e.g. if the ndarray is multi-dimensional). + return pd.Series(arr) + except ValueError: + return list(arr) + + +def _unwrap_ndarray_object_type_if_needed(arr: np.ndarray) -> np.ndarray: + """Unwrap an object-dtyped NumPy ndarray containing ndarray pointers into a single + contiguous ndarray, if needed/possible. 
+ """ + if arr.dtype.type is np.object_: + try: + # Try to convert the NumPy ndarray to a non-object dtype. + arr = np.array([np.asarray(v) for v in arr]) + except Exception: + # This may fail if the subndarrays are of heterogeneous shape + pass + return arr + + +def _cast_ndarray_columns_to_tensor_extension(df: "pd.DataFrame") -> "pd.DataFrame": + """ + Cast all NumPy ndarray columns in df to our tensor extension type, TensorArray. + """ + pd = _lazy_import_pandas() + try: + SettingWithCopyWarning = pd.core.common.SettingWithCopyWarning + except AttributeError: + # SettingWithCopyWarning was moved to pd.errors in Pandas 1.5.0. + SettingWithCopyWarning = pd.errors.SettingWithCopyWarning + + from ray.data._internal.tensor_extensions.pandas import ( + TensorArray, + column_needs_tensor_extension, + ) + + # Try to convert any ndarray columns to TensorArray columns. + # TODO(Clark): Once Pandas supports registering extension types for type + # inference on construction, implement as much for NumPy ndarrays and remove + # this. See https://github.com/pandas-dev/pandas/issues/41848 + # TODO(Clark): Optimize this with propagated DataFrame metadata containing a list of + # column names containing tensor columns, to make this an O(# of tensor columns) + # check rather than the current O(# of columns) check. + for col_name, col in df.items(): + if column_needs_tensor_extension(col): + try: + # Suppress Pandas warnings: + # https://github.com/ray-project/ray/issues/29270 + # We actually want in-place operations so we suppress this warning. + # https://stackoverflow.com/a/74193599 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + warnings.simplefilter("ignore", category=SettingWithCopyWarning) + df[col_name] = TensorArray(col) + except Exception as e: + raise ValueError( + f"Tried to cast column {col_name} to the TensorArray tensor " + "extension type but the conversion failed. 
+ To disable " + "automatic casting to this tensor extension, set " + "ctx = DataContext.get_current(); " + "ctx.enable_tensor_extension_casting = False." + ) from e + return df + + +def _cast_tensor_columns_to_ndarrays(df: "pd.DataFrame") -> "pd.DataFrame": + """Cast all tensor extension columns in df to NumPy ndarrays.""" + pd = _lazy_import_pandas() + try: + SettingWithCopyWarning = pd.core.common.SettingWithCopyWarning + except AttributeError: + # SettingWithCopyWarning was moved to pd.errors in Pandas 1.5.0. + SettingWithCopyWarning = pd.errors.SettingWithCopyWarning + from ray.data._internal.tensor_extensions.pandas import TensorDtype + + # Try to convert any tensor extension columns to ndarray columns. + # TODO(Clark): Optimize this with propagated DataFrame metadata containing a list of + # column names containing tensor columns, to make this an O(# of tensor columns) + # check rather than the current O(# of columns) check. + for col_name, col in df.items(): + if isinstance(col.dtype, TensorDtype): + # Suppress Pandas warnings: + # https://github.com/ray-project/ray/issues/29270 + # We actually want in-place operations so we suppress this warning. 
+ # https://stackoverflow.com/a/74193599 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + warnings.simplefilter("ignore", category=SettingWithCopyWarning) + df[col_name] = list(col.to_numpy()) + return df diff --git a/python/ray/data/util/torch_utils.py b/python/ray/data/util/torch_utils.py new file mode 100644 index 000000000000..fe8e73e57f1a --- /dev/null +++ b/python/ray/data/util/torch_utils.py @@ -0,0 +1,618 @@ +import warnings +from concurrent.futures import ThreadPoolExecutor +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +import pandas as pd +import pyarrow +import torch + +from ray._private.ray_constants import env_bool +from ray.data.collate_fn import ( + TensorBatchReturnType, + TensorBatchType, + _is_nested_tensor_sequence, + _is_tensor, + _is_tensor_mapping, + _is_tensor_sequence, + _is_tensor_sequence_mapping, +) +from ray.data.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + +# Default non-blocking transfer for tensors. +DEFAULT_TENSOR_NON_BLOCKING_TRANSFER = env_bool( + "RAY_AIR_DEFAULT_TENSOR_NON_BLOCKING_TRANSFER", + True, +) + + +def convert_pandas_to_torch_tensor( + data_batch: pd.DataFrame, + columns: Optional[Union[List[str], List[List[str]]]] = None, + column_dtypes: Optional[Union[torch.dtype, List[torch.dtype]]] = None, + unsqueeze: bool = True, +) -> Union[torch.Tensor, List[torch.Tensor]]: + """Converts a Pandas dataframe to a torch Tensor or list of torch Tensors. + + The format of the return type will match the format of ``columns``. If a + list of columns is provided, the return type will be a single tensor. If + ``columns`` is a list of lists, then the return type will be a list of + tensors. + + Args: + data_batch: The pandas dataframe to convert to a + torch tensor. + columns: + The names of the columns in the dataframe to include in the + torch tensor. 
If this arg is a List[List[str]], then the return + type will be a List of tensors. This is useful for multi-input + models. If None, then use all columns in the ``data_batch``. + column_dtypes: The + torch dtype to use for the tensor. If set to None, + then automatically infer the dtype. + unsqueeze: If set to True, the tensors + will be unsqueezed (reshaped to (N, 1)) before being concatenated into + the final tensor. Otherwise, they will be left as is, that is + (N, ). Defaults to True. + + Returns: + Either a torch tensor of size (N, len(columns)) where N is the + number of rows in the ``data_batch`` Dataframe, or a list of + tensors, where the size of item i is (N, len(columns[i])). + + """ + + multi_input = columns and (isinstance(columns[0], (list, tuple))) + + if not multi_input and column_dtypes and not isinstance(column_dtypes, torch.dtype): + raise TypeError( + "If `columns` is a list of strings, " + "`column_dtypes` must be None or a single `torch.dtype`." + f"Got {type(column_dtypes)} instead." + ) + + columns = columns if columns else [] + + def tensorize(vals, dtype): + """This recursive function allows to convert pyarrow List dtypes + to multi-dimensional tensors.""" + if isinstance(vals, pd.api.extensions.ExtensionArray): + # torch.as_tensor() does not yet support the __array__ protocol, so we need + # to convert extension arrays to ndarrays manually before converting to a + # Torch tensor. + # See https://github.com/pytorch/pytorch/issues/51156. + vals = vals.to_numpy() + + if vals.dtype.type is np.object_: + # Column has an object dtype which Torch can't handle, so we try to + # tensorize each column element and then stack the resulting tensors. + tensors = [tensorize(x, dtype) for x in vals] + try: + return torch.stack(tensors) + except RuntimeError: + # NOTE: RuntimeError is raised when trying to stack ragged tensors. + # Try to coerce the tensor to a nested tensor, if possible. 
+ # If this fails, the exception will be propagated up to the caller. + return torch.nested_tensor(tensors) + else: + return torch.as_tensor(vals, dtype=dtype) + + def get_tensor_for_columns(columns, dtype): + feature_tensors = [] + + if columns: + batch = data_batch[columns] + else: + batch = data_batch + + for col in batch.columns: + col_vals = batch[col].values + try: + t = tensorize(col_vals, dtype=dtype) + except Exception as e: + raise ValueError( + f"Failed to convert column {col} to a Torch Tensor of dtype " + f"{dtype}. See above exception chain for the exact failure." + ) from e + if unsqueeze: + t = t.unsqueeze(1) + feature_tensors.append(t) + + if len(feature_tensors) > 1: + feature_tensor = torch.cat(feature_tensors, dim=1) + else: + feature_tensor = feature_tensors[0] + return feature_tensor + + if multi_input: + if type(column_dtypes) not in [list, tuple]: + column_dtypes = [column_dtypes] * len(columns) + return [ + get_tensor_for_columns(columns=subcolumns, dtype=dtype) + for subcolumns, dtype in zip(columns, column_dtypes) + ] + else: + return get_tensor_for_columns(columns=columns, dtype=column_dtypes) + + +def convert_ndarray_to_torch_tensor( + ndarray: np.ndarray, + dtype: Optional[torch.dtype] = None, + device: Optional[Union[str, "torch.device"]] = None, + pin_memory: bool = False, +) -> torch.Tensor: + """Convert a NumPy ndarray to a Torch Tensor. + + Args: + ndarray: A NumPy ndarray that we wish to convert to a Torch Tensor. + dtype: A Torch dtype for the created tensor; if None, the dtype will be + inferred from the NumPy ndarray data. + device: The device on which the tensor(s) should be placed; if None, the Torch + tensor(s) will be constructed on the CPU. + pin_memory: Whether to pin the memory of the created tensors. + + Returns: + A Torch Tensor. + """ + ndarray = _unwrap_ndarray_object_type_if_needed(ndarray) + + # Object dtype cannot be converted into PyTorch Tensor. 
+ if ndarray.dtype.type is np.object_: + raise RuntimeError( + "Numpy array of object dtype cannot be converted to a Torch Tensor. This " + "may because the numpy array is a ragged tensor--it contains items of " + "different sizes. If using `iter_torch_batches()` API, you can pass in a " + "`collate_fn` argument to specify custom logic to convert the Numpy array " + "batch to a Torch tensor batch." + ) + + # The numpy array is not always writeable as it can come from the Ray object store. + # Numpy will throw a verbose warning here, which we suppress, as we don't write + # to the tensors. We also don't want to copy the array to avoid memory overhead. + # Original warning: https://github.com/pytorch/pytorch/blob/v1.13.0/ + # torch/csrc/utils/tensor_numpy.cpp#L198-L206 + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + result = torch.as_tensor(ndarray, dtype=dtype, device=device) + + if pin_memory: + assert result.device.type == "cpu", ( + "Pin memory is only supported for CPU tensors. " + f"Got device: {result.device} and pin_memory: {pin_memory}." + ) + result = result.pin_memory() + + return result + + +def convert_ndarray_batch_to_torch_tensor_batch( + ndarrays: Union[np.ndarray, Dict[str, np.ndarray]], + dtypes: Optional[Union[torch.dtype, Dict[str, torch.dtype]]] = None, + device: Optional[Union[str, "torch.device"]] = None, + pin_memory: bool = False, +) -> Union[torch.Tensor, Dict[str, torch.Tensor]]: + """Convert a NumPy ndarray batch to a Torch Tensor batch. + + Args: + ndarrays: A (dict of) NumPy ndarray(s) that we wish to convert to a Torch Tensor. + dtypes: A (dict of) Torch dtype(s) for the created tensor; if None, the dtype + will be inferred from the NumPy ndarray data. + device: The device on which the tensor(s) should be placed; if None, the Torch + tensor(s) will be constructed on the CPU. + pin_memory: Whether to pin the memory of the created tensors. + + Returns: + A (dict of) Torch Tensor(s). 
+ """ + if isinstance(ndarrays, np.ndarray): + # Single-tensor case. + if isinstance(dtypes, dict): + if len(dtypes) != 1: + raise ValueError( + "When constructing a single-tensor batch, only a single dtype " + f"should be given, instead got: {dtypes}" + ) + dtypes = next(iter(dtypes.values())) + batch = convert_ndarray_to_torch_tensor( + ndarrays, + dtype=dtypes, + device=device, + pin_memory=pin_memory, + ) + else: + # Multi-tensor case. + batch = { + col_name: convert_ndarray_to_torch_tensor( + col_ndarray, + dtype=dtypes[col_name] if isinstance(dtypes, dict) else dtypes, + device=device, + pin_memory=pin_memory, + ) + for col_name, col_ndarray in ndarrays.items() + } + + return batch + + +def load_torch_model( + saved_model: Union[torch.nn.Module, Dict], + model_definition: Optional[torch.nn.Module] = None, +) -> torch.nn.Module: + """Loads a PyTorch model from the provided ``saved_model``. + + ``model_definition`` is only used when ``saved_model`` is + a torch state dict, which will be loaded into ``model_definition``. + Otherwise, ``model_definition`` is discarded. + """ + if isinstance(saved_model, torch.nn.Module): + return saved_model + elif isinstance(saved_model, dict): + if not model_definition: + raise ValueError( + "Attempting to load torch model from a " + "state_dict, but no `model_definition` was " + "provided." + ) + model_definition.load_state_dict(saved_model) + return model_definition + else: + raise ValueError( + f"Saved model is of type {type(saved_model)}. " + f"The model saved in the checkpoint is expected " + f"to be of type `torch.nn.Module`, or a model " + f"state dict of type dict." 
+ ) + + +def contains_tensor(obj): + if isinstance(obj, torch.Tensor): + return True + elif isinstance(obj, dict): + for k, v in obj.items(): + if contains_tensor(k): + return True + if contains_tensor(v): + return True + elif isinstance(obj, (list, tuple)): + for v in obj: + if contains_tensor(v): + return True + return False + + +# Not present in torch<=1.7.0 +# Adapted from https://github.com/pytorch/pytorch/blob/\ +# c18da597e0bb1c1aecc97c77a73fed1849057fa4/torch/nn/modules/utils.py +def consume_prefix_in_state_dict_if_present_not_in_place( + state_dict: Dict[str, Any], prefix: str +) -> Dict[str, Any]: + """Strip the prefix in state_dict, if any and return a new dict. + + Adapted from https://github.com/pytorch/pytorch/blob/\ +c18da597e0bb1c1aecc97c77a73fed1849057fa4/torch/nn/modules/utils.py + The original method modified the dict in-place. + + Args: + state_dict: a state-dict to be loaded to the model. + prefix: prefix. + + Returns: + dict: A new state dict with the specified prefix removed from all keys + that started with the prefix. Keys that don't start with the prefix + remain unchanged. If no keys start with the prefix, returns a copy + of the original state dict. + """ + copied = False + + for key in state_dict: + if key.startswith(prefix): + newkey = key[len(prefix) :] + if not copied: + # We are doing shallow copies here, so the performance + # impact should be negligible anyway, but this is + # a simple optimization. 
+ state_dict = state_dict.copy() + copied = True + state_dict[newkey] = state_dict.pop(key) + + if "_metadata" in state_dict: + state_dict["_metadata"] = state_dict["_metadata"].copy() + metadata = state_dict["_metadata"] + for key in metadata: + if len(key) == 0: + continue + newkey = key[len(prefix) :] + metadata[newkey] = metadata.pop(key) + + return state_dict + + +def convert_ndarray_list_to_torch_tensor_list( + ndarrays: Dict[str, List[np.ndarray]], + dtypes: Optional[Union[torch.dtype, Dict[str, torch.dtype]]] = None, + device: Optional[Union[str, "torch.device"]] = None, + pin_memory: bool = False, +) -> Dict[str, List[torch.Tensor]]: + """Convert a dict mapping column names to lists of ndarrays to Torch Tensors. + + Args: + ndarrays: A dict mapping column names to lists of ndarrays that we wish to convert + to Torch Tensors. + dtypes: A (dict of) Torch dtype(s) for the created tensors; if None, the dtype + will be inferred from the NumPy ndarray data. + device: The device on which the tensor(s) should be placed; if None, the Torch + tensor(s) will be constructed on the CPU. + pin_memory: Whether to pin the memory of the created tensors. + + Returns: + A dict mapping column names to lists of Tensors. + """ + return { + col_name: [ + convert_ndarray_batch_to_torch_tensor_batch( + ndarray, + dtypes=dtypes[col_name] if isinstance(dtypes, dict) else dtypes, + device=device, + pin_memory=pin_memory, + ) + for ndarray in col_ndarrays + ] + for col_name, col_ndarrays in ndarrays.items() + } + + +def arrow_batch_to_tensors( + batch: pyarrow.Table, + dtypes: Optional[Union[torch.dtype, Dict[str, torch.dtype]]] = None, + combine_chunks: bool = False, + pin_memory: bool = False, + threadpool: Optional[ThreadPoolExecutor] = None, +) -> Union[Dict[str, torch.Tensor], Dict[str, List[torch.Tensor]]]: + """Convert PyArrow batch to PyTorch tensors. 
+ + Args: + batch: PyArrow batch to convert + dtypes: A (dict of) Torch dtype(s) for the created tensors; if None, the dtype + will be inferred from the NumPy ndarray data. + combine_chunks: If True, combine chunks in Arrow batch before converting to + tensors. + pin_memory: Whether to pin the memory of the created tensors. + threadpool: Optional ThreadPoolExecutor for parallel processing. If provided, + columns/arrays will be processed in parallel. If None, processing is + sequential. + + Returns: + When combine_chunks=True: A dictionary of column name to single tensor. + When combine_chunks=False: A dictionary of column name to list of tensors. + """ + from ray.data._internal.arrow_block import ArrowBlockAccessor + from ray.data._internal.arrow_ops import transform_pyarrow + + if combine_chunks: + numpy_batch = ArrowBlockAccessor(batch).to_batch_format("numpy") + num_columns = len(numpy_batch) + + if num_columns > 1 and threadpool is not None: + # Process columns in parallel using provided threadpool + def process_column( + col_name_col_array: Tuple[str, np.ndarray] + ) -> Tuple[str, torch.Tensor]: + col_name, col_array = col_name_col_array + return col_name, convert_ndarray_batch_to_torch_tensor_batch( + col_array, + dtypes=dtypes[col_name] if isinstance(dtypes, dict) else dtypes, + pin_memory=pin_memory, + ) + + # Submit all columns to threadpool and collect results + processed_cols = threadpool.map(process_column, numpy_batch.items()) + return dict(processed_cols) + else: + # Sequential processing for single column or single worker + return { + col_name: convert_ndarray_batch_to_torch_tensor_batch( + col_array, + dtypes=dtypes[col_name] if isinstance(dtypes, dict) else dtypes, + pin_memory=pin_memory, + ) + for col_name, col_array in numpy_batch.items() + } + else: + numpy_list = transform_pyarrow.table_to_numpy_dict_chunked( + batch, + ) + # Count total number of arrays across all columns + total_arrays = sum(len(arrays) for arrays in numpy_list.values()) + 
num_columns = len(numpy_list) + + if total_arrays > 1 and threadpool is not None: + # Process arrays in parallel using provided threadpool + def process_array( + array_item: Tuple[str, int, np.ndarray] + ) -> Tuple[str, int, torch.Tensor]: + col_name, array_index, array = array_item + return ( + col_name, + array_index, + convert_ndarray_batch_to_torch_tensor_batch( + array, + dtypes=dtypes[col_name] if isinstance(dtypes, dict) else dtypes, + pin_memory=pin_memory, + ), + ) + + # Flatten arrays with column name and index for parallel processing + array_items = [ + (col_name, idx, array) + for col_name, arrays in numpy_list.items() + for idx, array in enumerate(arrays) + ] + + # Submit all arrays to threadpool and collect results + processed_arrays = list(threadpool.map(process_array, array_items)) + + # Initialize result with all columns from numpy_list, including empty ones + # Pre-allocate lists of the correct size for each column + result: Dict[str, List[torch.Tensor]] = { + col_name: [None] * len(arrays) + for col_name, arrays in numpy_list.items() + } + + # Populate result with processed tensors + for col_name, array_index, tensor in processed_arrays: + result[col_name][array_index] = tensor + + return result + else: + # Sequential processing + return convert_ndarray_list_to_torch_tensor_list( + numpy_list, + dtypes=dtypes, + pin_memory=pin_memory, + ) + + +@torch.no_grad() +def concat_tensors_to_device( + tensor_sequence: Sequence[torch.Tensor], + device: Optional[Union[str, "torch.device"]] = None, + non_blocking: bool = DEFAULT_TENSOR_NON_BLOCKING_TRANSFER, +) -> torch.Tensor: + """Stack sequence of tensors into a contiguous GPU tensor. + + Args: + tensor_sequence: Sequence of tensors to stack + device: The device to move tensors to. If None, tensors are not moved. + non_blocking: If True, perform device transfer without forcing a + synchronization. 
+ + Returns: + A contiguous tensor on the target device + """ + # Assumes tensors have the same shape/dtype + assert ( + tensor_sequence + ), f"Cannot stack empty sequence of tensors. Received: {tensor_sequence}" + + assert all( + isinstance(t, torch.Tensor) for t in tensor_sequence + ), "All items must be torch.Tensor. Found invalid types: " + str( + [type(t) for t in tensor_sequence if not isinstance(t, torch.Tensor)] + ) + + # If there is only one tensor and its device already matches, return it directly. + if len(tensor_sequence) == 1 and ( + device is None or tensor_sequence[0].device == torch.device(device) + ): + return tensor_sequence[0] + + first_dtype = tensor_sequence[0].dtype + assert all(t.dtype == first_dtype for t in tensor_sequence), ( + "All tensors must have the same dtype. " + f"Expected: {first_dtype}, got: {[t.dtype for t in tensor_sequence]}" + ) + + first_shape = tensor_sequence[0].shape[1:] + assert all(t.shape[1:] == first_shape for t in tensor_sequence), ( + "All tensors must have the same shape[1:]. " + f"Expected: {first_shape}, got: {[t.shape[1:] for t in tensor_sequence]}" + ) + + first = tensor_sequence[0] + dtype = first.dtype + shape_tail = first.shape[1:] + total_rows = sum(t.shape[0] for t in tensor_sequence) + + # Allocate an empty Tensor on device + result = torch.empty((total_rows, *shape_tail), dtype=dtype, device=device) + + row_start = 0 + for t in tensor_sequence: + row_end = row_start + t.shape[0] + result[row_start:row_end].copy_(t, non_blocking=non_blocking) + row_start = row_end + + return result + + +def _get_type_str(batch: Any) -> str: + """Get a string representation of the possibly nested type of the batch. 
+ + >>> import torch + >>> _get_type_str([1, 2, "???"]) + 'list[int | str]' + >>> _get_type_str({"a": [1, 2, 3], "b": 4}) + 'dict[str, int | list[int]]' + >>> _get_type_str({"a": torch.tensor(1), "b": [torch.tensor(2)]}) + 'dict[str, Tensor | list[Tensor]]' + >>> _get_type_str({"a": torch.tensor(1), "b": {"c": torch.tensor(2)}}) + 'dict[str, Tensor | dict[str, Tensor]]' + """ + curr_type = type(batch).__name__ + if isinstance(batch, (list, tuple)): + val_types = " | ".join(sorted({_get_type_str(v) for v in batch})) + invalid_type_str = f"{curr_type}[{val_types}]" + elif isinstance(batch, dict): + val_types = " | ".join(sorted({_get_type_str(v) for v in batch.values()})) + invalid_type_str = f"{curr_type}[str, {val_types}]" + else: + invalid_type_str = curr_type + return invalid_type_str + + +@torch.no_grad() +def move_tensors_to_device( + batch: TensorBatchType, + device: Optional[Union[str, "torch.device"]] = None, + non_blocking: bool = DEFAULT_TENSOR_NON_BLOCKING_TRANSFER, +) -> TensorBatchReturnType: + """Move tensors to the specified device. + + Concatenate nested lists/tuples of tensors along the first (batch) dimension. + For example, for the input + ((feature_0_chunk_0,), (feature_1_chunk_0, feature_1_chunk_1)) + the output will be (feature_0_chunk_0, feature_1_chunk_0+1) + where each feature is concatenated along the batch dimension. + + Args: + batch: A tensor or collection of tensors to move to device. Can be: + - A single tensor + - A sequence of tensors + - A sequence of sequences of tensors. The inner sequence of tensors is + combined during GPU transfer. + - A mapping (e.g., dict) of keys to tensors or sequences of tensors. The + sequence of tensors is combined during GPU transfer. + device: The device to move tensors to. If None, tensors are not moved. + non_blocking: If True, perform device transfer without forcing a + synchronization. 
+ + Returns: + The input tensors moved to the specified device + """ + if device is None: + return batch + + if _is_tensor(batch): + return batch.to(device, non_blocking=non_blocking) + elif _is_tensor_sequence(batch): + return type(batch)([t.to(device, non_blocking=non_blocking) for t in batch]) + elif _is_nested_tensor_sequence(batch): + return type(batch)( + [concat_tensors_to_device(t, device, non_blocking) for t in batch] + ) + elif _is_tensor_mapping(batch): + return {k: t.to(device, non_blocking=non_blocking) for k, t in batch.items()} + elif _is_tensor_sequence_mapping(batch): + return { + k: concat_tensors_to_device(v, device, non_blocking) + for k, v in batch.items() + } + else: + raise ValueError( + f"Invalid input type: {_get_type_str(batch)}.\n" + "Expected one of the following: " + "torch.Tensor, " + "List/Tuple[torch.Tensor], " + "Dict[str, torch.Tensor], " + "Mapping[str, List/Tuple[torch.Tensor]]" + ) diff --git a/python/ray/train/tests/test_torch_detection_predictor.py b/python/ray/train/tests/test_torch_detection_predictor.py index 6af692aeaf2c..c3a8d9878420 100644 --- a/python/ray/train/tests/test_torch_detection_predictor.py +++ b/python/ray/train/tests/test_torch_detection_predictor.py @@ -2,7 +2,7 @@ import pytest from torchvision import models -from ray.air.util.tensor_extensions.utils import create_ragged_ndarray +from ray.data._internal.tensor_extensions.utils import create_ragged_ndarray from ray.train.torch import TorchDetectionPredictor