Fix: S3 read text with version ID was not working (#1587)

LeonLuttenberger · web-flow · commit 938e83cee998 · 2022-09-09T12:52:00.000-05:00
diff --git a/awswrangler/s3/_read_text.py b/awswrangler/s3/_read_text.py
@@ -33,6 +33,13 @@
 _logger: logging.Logger = logging.getLogger(__name__)
 
 
+def _get_version_id_for(version_id: Optional[Union[str, Dict[str, str]]], path: str) -> Optional[str]:
+    if isinstance(version_id, dict):
+        return version_id.get(path, None)
+
+    return version_id
+
+
 def _get_read_details(path: str, pandas_kwargs: Dict[str, Any]) -> Tuple[str, Optional[str], Optional[str]]:
     if pandas_kwargs.get("compression", "infer") == "infer":
         pandas_kwargs["compression"] = infer_compression(path, compression="infer")
@@ -52,7 +59,7 @@ def _read_text_chunked(
     s3_additional_kwargs: Optional[Dict[str, str]],
     dataset: bool,
     use_threads: Union[bool, int],
-    version_ids: Optional[Dict[str, str]] = None,
+    version_ids: Optional[Dict[str, Optional[str]]] = None,
 ) -> Iterator[pd.DataFrame]:
     for path in paths:
         _logger.debug("path: %s", path)
@@ -157,19 +164,21 @@ def _read_text(
     }
     _logger.debug("args:\n%s", pprint.pformat(args))
 
-    if chunksize is not None:
-        return _read_text_chunked(
-            paths=paths, version_ids=version_id if isinstance(version_id, dict) else None, chunksize=chunksize, **args
+    if len(paths) > 1 and version_id is not None and not isinstance(version_id, dict):
+        raise exceptions.InvalidArgumentCombination(
+            "If multiple paths are provided along with a file version ID, the version ID parameter must be a dict."
         )
+    version_id_dict = {path: _get_version_id_for(version_id, path) for path in paths}
 
-    version_id = version_id if isinstance(version_id, dict) else None
+    if chunksize is not None:
+        return _read_text_chunked(paths=paths, version_ids=version_id_dict, chunksize=chunksize, **args)
 
     executor = _get_executor(use_threads=use_threads)
     tables = executor.map(
         _read_text_file,
         session,
         paths,
-        itertools.repeat(version_id),
+        [version_id_dict[path] for path in paths],
         itertools.repeat(parser_func),
         itertools.repeat(path_root),
         itertools.repeat(pandas_kwargs),
diff --git a/tests/unit/test_s3_text.py b/tests/unit/test_s3_text.py
@@ -325,9 +325,14 @@ def test_read_json_versioned(path) -> None:
         pd.DataFrame({"id": [4, 5, 6], "value": ["foo", "boo", "bar"]}),
         pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]}),
     ]
+    version_ids = []
+
     for df in dfs:
         wr.s3.to_json(df=df, path=path_file)
         version_id = wr.s3.describe_objects(path=path_file)[path_file]["VersionId"]
+        version_ids.append(version_id)
+
+    for df, version_id in zip(dfs, version_ids):
         df_temp = wr.s3.read_json(path_file, version_id=version_id)
         assert df_temp.equals(df)
         assert version_id == wr.s3.describe_objects(path=path_file, version_id=version_id)[path_file]["VersionId"]
@@ -339,9 +344,14 @@ def test_read_csv_versioned(path) -> None:
         pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5]}),
         pd.DataFrame({"c0": [3, 4, 5], "c1": [6, 7, 8]}),
     ]
+    version_ids = []
+
     for df in dfs:
         wr.s3.to_csv(df=df, path=path_file, index=False)
         version_id = wr.s3.describe_objects(path=path_file)[path_file]["VersionId"]
+        version_ids.append(version_id)
+
+    for df, version_id in zip(dfs, version_ids):
         df_temp = wr.s3.read_csv(path_file, version_id=version_id)
         assert df_temp.equals(df)
         assert version_id == wr.s3.describe_objects(path=path_file, version_id=version_id)[path_file]["VersionId"]