Skip to content

Commit ddeee46 — Merge branch 'main' into fix/iceberg-new-cols
(merge commit with 2 parents: 5f561c5 + e939741)

File tree: 8 files changed (+138 lines added, -139 lines removed)

awswrangler/distributed/ray/_executor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def __init__(self, max_concurrency: int) -> None:
3939
super().__init__()
4040

4141
_logger.debug("Initializing Ray Actor with maximum concurrency %d", max_concurrency)
42-
self._actor: ray.actor.ActorHandle = AsyncActor.options(max_concurrency=max_concurrency).remote() # type: ignore[attr-defined]
42+
self._actor: "ray.actor.ActorHandle[AsyncActor]" = AsyncActor.options(max_concurrency=max_concurrency).remote() # type: ignore[attr-defined]
4343

4444
def map(self, func: Callable[..., MapOutputType], _: "BaseClient" | None, *args: Any) -> list[MapOutputType]:
4545
"""Map func and return ray futures."""

awswrangler/distributed/ray/datasources/arrow_parquet_datasource.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,6 @@ def get_read_tasks(self, parallelism: int) -> list[ReadTask]:
344344

345345
meta = self._meta_provider(
346346
paths, # type: ignore[arg-type]
347-
self._inferred_schema,
348347
num_fragments=len(fragments),
349348
prefetched_metadata=metadata,
350349
)
@@ -566,7 +565,7 @@ def compute_batch_size_rows(sample_info: _SampleInfo) -> int:
566565
if sample_info.actual_bytes_per_row is None:
567566
return PARQUET_READER_ROW_BATCH_SIZE
568567
else:
569-
max_parquet_reader_row_batch_size_bytes = DataContext.get_current().target_max_block_size // 10
568+
max_parquet_reader_row_batch_size_bytes = DataContext.get_current().target_max_block_size // 10 # type: ignore[operator]
570569
return max(
571570
1,
572571
min(

awswrangler/distributed/ray/datasources/file_datasink.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def _write_block(write_path: str, block: pd.DataFrame) -> str:
7878
write_path = self.path
7979

8080
if write_path.endswith("/"):
81-
filename = self.filename_provider.get_filename_for_block(block, ctx.task_idx, 0)
81+
filename = self.filename_provider.get_filename_for_block(block, "", ctx.task_idx, 0)
8282
write_path = posixpath.join(self.path, filename)
8383

8484
return _write_block(write_path, block)

awswrangler/distributed/ray/datasources/filename_provider.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,16 @@ def __init__(
2626
def get_filename_for_block(
2727
self,
2828
block: Block,
29+
write_uuid: str,
2930
task_index: int,
3031
block_index: int,
3132
) -> str:
3233
file_id = f"{task_index:06}_{block_index:06}"
3334
return self._generate_filename(file_id)
3435

35-
def get_filename_for_row(self, row: dict[str, Any], task_index: int, block_index: int, row_index: int) -> str:
36+
def get_filename_for_row(
37+
self, row: dict[str, Any], write_uuid: str, task_index: int, block_index: int, row_index: int
38+
) -> str:
3639
file_id = f"{task_index:06}_{block_index:06}_{row_index:06}"
3740
return self._generate_filename(file_id)
3841

awswrangler/mysql.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def connect(
168168
attrs: _db_utils.ConnectionAttributes = _db_utils.get_connection_attributes(
169169
connection=connection, secret_id=secret_id, catalog_id=catalog_id, dbname=dbname, boto3_session=boto3_session
170170
)
171-
if attrs.kind != "mysql":
171+
if attrs.kind not in ("mysql", "aurora-mysql"):
172172
raise exceptions.InvalidDatabaseType(f"Invalid connection type ({attrs.kind}. It must be a MySQL connection.)")
173173
return pymysql.connect(
174174
user=attrs.user,

awswrangler/postgresql.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ def connect(
225225
attrs: _db_utils.ConnectionAttributes = _db_utils.get_connection_attributes(
226226
connection=connection, secret_id=secret_id, catalog_id=catalog_id, dbname=dbname, boto3_session=boto3_session
227227
)
228-
if attrs.kind not in ("postgresql", "postgres"):
228+
if attrs.kind not in ("postgresql", "postgres", "aurora-postgresql"):
229229
raise exceptions.InvalidDatabaseType(
230230
f"Invalid connection type ({attrs.kind}. It must be a postgresql connection.)"
231231
)

pyproject.toml

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,7 @@ dependencies = [
2323
"botocore>=1.23.32,<2",
2424
"pandas>=1.2.0,<3.0.0",
2525
"numpy>=1.26,<3.0",
26-
"pyarrow>=18.0.0,<21.0.0 ; sys_platform != 'darwin' or platform_machine != 'x86_64'",
27-
# pyarrow 18 causes macos build failures
28-
# https://github.com/ray-project/ray/pull/48446
29-
"pyarrow>=8.0.0,<18.0.0 ; sys_platform == 'darwin' and platform_machine == 'x86_64'",
26+
"pyarrow>=8.0.0,<21.0.0",
3027
"typing-extensions>=4.4.0,<5",
3128
"packaging>=21.1,<26.0",
3229
"setuptools ; python_version >= '3.12'",
@@ -41,7 +38,7 @@ oracle = ["oracledb>=1,<4"]
4138
gremlin = [
4239
"gremlinpython>=3.7.1,<4",
4340
"requests>=2.0.0,<3",
44-
"aiohttp>=3.9.0,<4",
41+
"aiohttp>=3.12.14,<4",
4542
"async-timeout>=4.0.3,<6.0.0",
4643
]
4744
sparql = [
@@ -58,8 +55,8 @@ openpyxl = ["openpyxl>=3.0.0,<4"]
5855
progressbar = ["progressbar2>=4.0.0,<5"]
5956
deltalake = ["deltalake>=0.18.0,<1.2.0"]
6057
geopandas = ["geopandas>=1.0.0,<2"]
61-
modin = ["modin>=0.31,<0.35"]
62-
ray = ["ray[default, data]>=2.45.0,<3"]
58+
modin = ["modin>=0.31,<0.36"]
59+
ray = ["ray[default, data]>=2.49.0,<3"]
6360

6461
[project.urls]
6562
Homepage = "https://aws-sdk-pandas.readthedocs.io/"

Comments: 0