Commit d62a475

Backward-compatible partition handling
1 parent f69869c commit d62a475

3 files changed: +75, -4 lines changed

3 files changed

+75
-4
lines changed

awswrangler/distributed/_distributed.py

Lines changed: 1 addition & 1 deletion
@@ -132,7 +132,7 @@ def initialize_ray(
     cpu_count : Optional[int]
         Number of CPUs to assign to each raylet, by default None
     gpu_count : Optional[int]
-        Number of GPUs to assign to each raylet, by default 0
+        Number of GPUs to assign to each raylet, by default None
     """
     if not ray.is_initialized():
         # Detect an existing cluster
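
This hunk only corrects the documented default for gpu_count. For context, a minimal usage sketch; the import path and the availability of the distributed add-on are assumptions, not confirmed by this diff:

# Hypothetical usage sketch, assuming initialize_ray is exposed from
# awswrangler.distributed and that both counts are optional as documented.
from awswrangler.distributed import initialize_ray

# Leaving both counts as None (the documented default) defers the per-raylet
# CPU/GPU assignment to Ray itself.
initialize_ray(cpu_count=None, gpu_count=None)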

awswrangler/distributed/datasources/parquet_datasource.py

Lines changed: 72 additions & 2 deletions
@@ -1,25 +1,95 @@
 """Distributed ParquetDatasource Module."""
 
 import logging
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, Iterator, List, Optional, Union
 
 # fs required to implicitly trigger S3 subsystem initialization
 import pyarrow.fs # noqa: F401 pylint: disable=unused-import
+import ray
+from ray.data._internal.output_buffer import BlockOutputBuffer
 from ray.data._internal.remote_fn import cached_remote_fn
 from ray.data.block import Block, BlockAccessor, BlockMetadata
+from ray.data.context import DatasetContext
 from ray.data.datasource import BlockWritePathProvider, DefaultBlockWritePathProvider, Reader
 from ray.data.datasource.datasource import WriteResult
 from ray.data.datasource.file_based_datasource import (
     _resolve_paths_and_filesystem,
     _S3FileSystemWrapper,
     _wrap_s3_serialization_workaround,
 )
-from ray.data.datasource.parquet_datasource import _ParquetDatasourceReader
+from ray.data.datasource.parquet_datasource import (
+    PARQUET_READER_ROW_BATCH_SIZE,
+    _deserialize_pieces_with_retry,
+    _ParquetDatasourceReader,
+    _SerializedPiece,
+)
 from ray.types import ObjectRef
 
+from awswrangler._arrow import _add_table_partitions
+
 _logger: logging.Logger = logging.getLogger(__name__)
 
 
+def _read_pieces(
+    block_udf: Optional[Callable[[Block[Any]], Block[Any]]],
+    reader_args: Any,
+    columns: Optional[List[str]],
+    schema: Optional[Union[type, "pyarrow.lib.Schema"]],
+    serialized_pieces: List[_SerializedPiece],
+) -> Iterator["pyarrow.Table"]:
+    # This import is necessary to load the tensor extension type.
+    from ray.data.extensions.tensor_extension import ( # type: ignore # noqa: F401, E501 # pylint: disable=import-outside-toplevel, unused-import
+        ArrowTensorType,
+    )
+
+    # Deserialize after loading the filesystem class.
+    pieces: List["pyarrow._dataset.ParquetFileFragment"] = _deserialize_pieces_with_retry(serialized_pieces)
+
+    # Ensure that we're reading at least one dataset fragment.
+    assert len(pieces) > 0
+
+    import pyarrow as pa # pylint: disable=import-outside-toplevel
+
+    ctx = DatasetContext.get_current()
+    output_buffer = BlockOutputBuffer(
+        block_udf=block_udf,
+        target_max_block_size=ctx.target_max_block_size,
+    )
+
+    _logger.debug("Reading %s parquet pieces", len(pieces))
+    use_threads = reader_args.pop("use_threads", False)
+    path_root = reader_args.pop("path_root", None)
+    for piece in pieces:
+        batches = piece.to_batches(
+            use_threads=use_threads,
+            columns=columns,
+            schema=schema,
+            batch_size=PARQUET_READER_ROW_BATCH_SIZE,
+            **reader_args,
+        )
+        for batch in batches:
+            # Table creation is wrapped inside _add_table_partitions
+            # to add columns with partition values when dataset=True
+            # and cast them to categorical
+            table = _add_table_partitions(
+                table=pa.Table.from_batches([batch], schema=schema),
+                path=f"s3://{piece.path}",
+                path_root=path_root,
+            )
+            # If the table is empty, drop it.
+            if table.num_rows > 0:
+                output_buffer.add_block(table)
+                if output_buffer.has_next():
+                    yield output_buffer.next()
+    output_buffer.finalize()
+    if output_buffer.has_next():
+        yield output_buffer.next()
+
+
+# Patch _read_pieces function
+ray.data.datasource.parquet_datasource._read_pieces = _read_pieces # pylint: disable=protected-access
+
+
 class UserProvidedKeyBlockWritePathProvider(BlockWritePathProvider):
     """Block write path provider.
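
The patched _read_pieces mirrors the upstream Ray reader except that pa.Table.from_batches is wrapped in _add_table_partitions, so partition values encoded in each fragment's S3 path are re-attached as columns when path_root is provided. The real helper lives in awswrangler._arrow and also casts the columns to categorical; the following is only a minimal, self-contained sketch of the idea, assuming Hive-style key=value path segments:

from typing import Optional

import pyarrow as pa


def _add_partitions_sketch(table: pa.Table, path: str, path_root: Optional[str]) -> pa.Table:
    """Hypothetical stand-in for awswrangler._arrow._add_table_partitions (illustration only)."""
    if not path_root:
        return table
    # Take the path segments between the dataset root and the file name,
    # e.g. "year=2022/month=06" for s3://bucket/ds/year=2022/month=06/part-0.parquet.
    relative = path[len(path_root):].strip("/")
    for segment in relative.split("/")[:-1]:
        if "=" in segment:
            key, value = segment.split("=", 1)
            # Append the partition value as a constant column.
            table = table.append_column(key, pa.array([value] * table.num_rows))
    return table

With path_root="s3://bucket/ds", a fragment at s3://bucket/ds/year=2022/month=06/part-0.parquet would gain string columns "year" and "month" under this sketch.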

awswrangler/s3/_read_parquet.py

Lines changed: 2 additions & 1 deletion
@@ -350,6 +350,7 @@ def _read_parquet(
             schema=schema,
             columns=columns,
             dataset_kwargs=dataset_kwargs,
+            path_root=path_root,
         )
         return _to_modin(dataset=dataset, to_pandas_kwargs=arrow_kwargs)
 
@@ -475,7 +476,7 @@ def read_parquet(
         If integer is provided, specified number is used.
     parallelism : int, optional
         The requested parallelism of the read. Only used when `distributed` add-on is installed.
-        Parallelism may be limited by the number of files of the dataset. 200 by default.
+        Parallelism may be limited by the number of files of the dataset. -1 (autodetect) by default.
     boto3_session : boto3.Session(), optional
         Boto3 Session. The default boto3 session is used if None is received.
     s3_additional_kwargs : Optional[Dict[str, Any]]
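
Threading path_root into the datasource is what lets the patched reader restore partition columns in the distributed code path. A hedged usage sketch follows; the bucket and layout are placeholders, and the partition columns named in the comments are what a Hive-partitioned dataset would be expected to yield:

import awswrangler as wr

# Placeholder dataset laid out as s3://my-bucket/my-dataset/year=2022/month=06/...
df = wr.s3.read_parquet(
    path="s3://my-bucket/my-dataset/",
    dataset=True,  # treat the prefix as a dataset root so partition columns are added back
)
print(df.dtypes)  # expect "year" and "month" among the columns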
