Skip to content

Commit babf835

Browse files
Refactoring _delete_objects
1 parent 87a92e2 commit babf835

File tree

2 files changed

+27
-20
lines changed

2 files changed

+27
-20
lines changed

awswrangler/s3/_delete.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,34 +19,33 @@
1919
def _split_paths_by_bucket(paths: List[str]) -> Dict[str, List[str]]:
    """Group full S3 paths by the bucket they live in.

    Parameters
    ----------
    paths
        List of full S3 paths (``s3://bucket/key``).

    Returns
    -------
    Mapping of bucket name -> list of the full paths that belong to it,
    preserving the input order within each bucket.
    """
    grouped: Dict[str, List[str]] = {}
    for path in paths:
        bucket_name = _utils.parse_path(path=path)[0]
        # setdefault gives us the "create list on first sight" behavior in one step
        grouped.setdefault(bucket_name, []).append(path)
    return grouped
2928

3029

3130
@ray_remote
3231
def _delete_objects(
3332
boto3_session: Optional[boto3.Session],
34-
bucket: str,
35-
keys: List[str],
33+
paths: List[str],
3634
s3_additional_kwargs: Optional[Dict[str, Any]],
3735
) -> None:
3836
client_s3: boto3.client = _utils.client(
3937
service_name="s3",
4038
session=boto3_session,
4139
)
42-
_logger.debug("len(keys): %s", len(keys))
43-
batch: List[Dict[str, str]] = [{"Key": key} for key in keys]
40+
_logger.debug("len(paths): %s", len(paths))
4441
if s3_additional_kwargs:
4542
extra_kwargs: Dict[str, Any] = get_botocore_valid_kwargs(
4643
function_name="list_objects_v2", s3_additional_kwargs=s3_additional_kwargs
4744
)
4845
else:
4946
extra_kwargs = {}
47+
bucket = _utils.parse_path(path=paths[0])[0]
48+
batch: List[Dict[str, str]] = [{"Key": _utils.parse_path(path)[1]} for path in paths]
5049
res = client_s3.delete_objects(Bucket=bucket, Delete={"Objects": batch}, **extra_kwargs)
5150
deleted: List[Dict[str, Any]] = res.get("Deleted", [])
5251
for obj in deleted:
@@ -118,11 +117,16 @@ def delete_objects(
118117
last_modified_end=last_modified_end,
119118
s3_additional_kwargs=s3_additional_kwargs,
120119
)
121-
122-
buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths)
123-
for bucket, keys in buckets.items():
124-
chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000)
125-
executor = _get_executor(use_threads=use_threads)
126-
executor.map(
127-
_delete_objects, boto3_session, itertools.repeat(bucket), chunks, itertools.repeat(s3_additional_kwargs)
128-
)
120+
paths_by_bucket: Dict[List[str, List[str]]] = _split_paths_by_bucket(paths)
121+
122+
chunks = []
123+
for bucket in paths_by_bucket:
124+
chunks += _utils.chunkify(lst=paths_by_bucket[bucket], max_length=5)
125+
126+
executor = _get_executor(use_threads=use_threads)
127+
executor.map(
128+
_delete_objects,
129+
boto3_session,
130+
chunks,
131+
itertools.repeat(s3_additional_kwargs),
132+
)

tests/test_s3.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,15 @@ def test_list_by_last_modified_date(path):
6969
assert len(wr.s3.read_json(path, last_modified_begin=begin_utc, last_modified_end=end_utc).index) == 6
7070

7171

72-
def test_s3_delete_objects(path, path2):
    """Write objects into two buckets, delete them in one call, and verify both are gone.

    Fix: the upload loop previously used ``for path in paths``, shadowing the
    ``path`` fixture, so the final ``list_objects(f"{path}delete-test*")``
    globbed against the last uploaded object's full key — a pattern that can
    never match, making the assertion vacuously pass even if deletion failed.
    """
    df = pd.DataFrame({"id": [1, 2, 3]})
    objects_per_bucket = 10
    # Spread objects across two buckets so the bucket-splitting logic in
    # delete_objects is actually exercised.
    paths = [f"s3://{path}delete-test{i}.json" for i in range(objects_per_bucket)] + [
        f"s3://{path2}delete-test{i}.json" for i in range(objects_per_bucket)
    ]
    for obj_path in paths:  # do NOT shadow the `path` fixture here
        wr.s3.to_json(df, obj_path)
    wr.s3.delete_objects(path=paths)
    time.sleep(5)  # s3 read-consistency
    assert len(wr.s3.list_objects(f"{path}delete-test*")) == 0
    # Also confirm the second bucket's objects were deleted.
    assert len(wr.s3.list_objects(f"{path2}delete-test*")) == 0

0 commit comments

Comments (0)