 import os.path
 from hashlib import sha1
 from pathlib import Path
-from typing import List
+from typing import List, Optional

 from tenacity import retry, stop_after_attempt, wait_random_exponential

 import aioboto3
 from boto3.s3.transfer import TransferConfig
+import asyncio

 from servicex.models import ResultFile, TransformStatus

-_transferconfig = TransferConfig(max_concurrency=10)
+_transferconfig = TransferConfig(max_concurrency=5)
+_sem = asyncio.Semaphore(10)
+_bucket_list_sem = asyncio.Semaphore(5)


 def init_s3_config(concurrency: int = 10):
     "Update the number of concurrent connections"
-    global _transferconfig
-    _transferconfig = TransferConfig(max_concurrency=concurrency)
+    global _sem
+    _sem = asyncio.Semaphore(concurrency)


 def _sanitize_filename(fname: str):
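A note on the throttling change above: boto3's `TransferConfig(max_concurrency=...)` only bounds the threads used within a single transfer, so the `asyncio.Semaphore` is what caps how many transfers run at once, and `init_s3_config` now resizes that semaphore rather than rebuilding the `TransferConfig`. A minimal, self-contained sketch of the semaphore pattern (the `fetch`/`main` names are illustrative, not from this codebase):

```python
import asyncio

_sem = asyncio.Semaphore(10)  # at most 10 coroutines inside the guarded block


async def fetch(i: int) -> int:
    async with _sem:              # blocks while 10 other tasks hold a slot
        await asyncio.sleep(0.1)  # stand-in for an S3 request/transfer
        return i


async def main() -> None:
    # 100 tasks are scheduled, but never more than 10 run the body at once
    results = await asyncio.gather(*(fetch(i) for i in range(100)))
    print(len(results))  # -> 100


asyncio.run(main())
```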
@@ -85,24 +88,31 @@ def for_transform(cls, transform: TransformStatus):
         stop=stop_after_attempt(3), wait=wait_random_exponential(max=60), reraise=True
     )
     async def list_bucket(self) -> List[ResultFile]:
-        async with self.minio.resource("s3", endpoint_url=self.endpoint_host) as s3:
-            bucket = await s3.Bucket(self.bucket)
-            objects = bucket.objects.all()
-            return [
-                ResultFile(
-                    filename=obj.key,
-                    size=await obj.size,
-                    extension=obj.key.split(".")[-1],
-                )
-                async for obj in objects
-                if not obj.key.endswith("/")
-            ]
+        async with _bucket_list_sem:
+            async with self.minio.client("s3", endpoint_url=self.endpoint_host) as s3:
+                paginator = s3.get_paginator("list_objects_v2")
+                pagination = paginator.paginate(Bucket=self.bucket)
+                listing = await pagination.build_full_result()
+                rv = [
+                    ResultFile(
+                        filename=_["Key"],
+                        size=_["Size"],
+                        extension=_["Key"].split(".")[-1],
+                    )
+                    for _ in listing["Contents"]
+                    if not _["Key"].endswith("/")
+                ]
+                return rv

     @retry(
         stop=stop_after_attempt(3), wait=wait_random_exponential(max=60), reraise=True
     )
     async def download_file(
-        self, object_name: str, local_dir: str, shorten_filename: bool = False
+        self,
+        object_name: str,
+        local_dir: str,
+        shorten_filename: bool = False,
+        expected_size: Optional[int] = None,
     ) -> Path:
         os.makedirs(local_dir, exist_ok=True)
         path = Path(
@@ -114,16 +124,26 @@ async def download_file(
             )
         )

-        async with self.minio.resource("s3", endpoint_url=self.endpoint_host) as s3:
-            obj = await s3.Object(self.bucket, object_name)
-            remotesize = await obj.content_length
+        async with self.minio.client("s3", endpoint_url=self.endpoint_host) as s3:
+            if expected_size is not None:
+                remotesize = expected_size
+            else:
+                async with _sem:
+                    info = await s3.head_object(Bucket=self.bucket, Key=object_name)
+                    remotesize = info["ContentLength"]
             if path.exists():
                 # if file size is the same, let's not download anything
                 # maybe move to a better verification mechanism with e-tags in the future
                 localsize = path.stat().st_size
                 if localsize == remotesize:
                     return path.resolve()
-            await obj.download_file(path.as_posix(), Config=_transferconfig)
+            async with _sem:
+                await s3.download_file(
+                    Bucket=self.bucket,
+                    Key=object_name,
+                    Filename=path.as_posix(),
+                    Config=_transferconfig,
+                )
             localsize = path.stat().st_size
             if localsize != remotesize:
                 raise RuntimeError(f"Download of {object_name} failed")
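Taken together, `list_bucket()` already returns each object's size, so a caller can forward it as `expected_size` and let `download_file()` skip the per-object `head_object` round-trip it would otherwise make. A hypothetical caller sketch (the `download_all` helper and the `adapter` instance are assumptions, not part of this diff):

```python
import asyncio


async def download_all(adapter, local_dir: str):
    # adapter is assumed to be a constructed instance of the class above
    files = await adapter.list_bucket()
    return await asyncio.gather(
        *(
            adapter.download_file(
                f.filename,  # ResultFile.filename, as built in list_bucket()
                local_dir,
                expected_size=f.size,  # skips the head_object call
            )
            for f in files
        )
    )
```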