Skip to content

Commit 510c9dc

Browse files
S3 loader to use boto3 built-in credential configuration (#723)
* S3Loader: allow authenticated S3 access using boto3's built-in configuration methods, without explicitly passing credentials; cf. https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials
* S3Loader tests: re-enable the tests reading from s3://commoncrawl/ in order to test authenticated reads. The tests are skipped if no AWS credentials are configured.
1 parent fbed87a commit 510c9dc

File tree

2 files changed

+23
-11
lines changed

2 files changed

+23
-11
lines changed

pywb/utils/loaders.py

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -185,7 +185,8 @@ class BlockLoader(BaseLoader):
185185
"""
186186
a loader which can stream blocks of content
187187
given a uri, offset and optional length.
188-
Currently supports: http/https and file/local file system
188+
Currently supports: http/https, file/local file system,
189+
pkg, WebHDFS, S3
189190
"""
190191

191192
loaders = {}
@@ -393,14 +394,15 @@ def load(self, url, offset, length):
393394

394395
def s3_load(anon=False):
395396
if not self.client:
397+
s3_client_args = {}
396398
if anon:
397-
config = Config(signature_version=UNSIGNED)
398-
else:
399-
config = None
399+
s3_client_args['config'] = Config(signature_version=UNSIGNED)
400+
if aws_access_key_id:
401+
s3_client_args['aws_access_key_id'] = aws_access_key_id
402+
s3_client_args['aws_secret_access_key'] = aws_secret_access_key
403+
404+
client = boto3.client('s3', **s3_client_args)
400405

401-
client = boto3.client('s3', aws_access_key_id=aws_access_key_id,
402-
aws_secret_access_key=aws_secret_access_key,
403-
config=config)
404406
else:
405407
client = self.client
406408

pywb/utils/test/test_loaders.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,19 @@
9797

9898
test_cdx_dir = get_test_dir() + 'cdx/'
9999

100-
@pytest.mark.skip("skip for now, made need different s3 source")
101-
def test_s3_read_1():
100+
def s3_authenticated_access_verification(bucket):
101+
import boto3, botocore
102+
s3_client = boto3.client('s3')
103+
try:
104+
s3_client.head_bucket(Bucket=bucket)
105+
except botocore.exceptions.NoCredentialsError:
106+
pytest.skip("Skipping S3Loader test for authenticated reads: no credentials configured")
107+
108+
def test_s3_read_authenticated_1():
102109
pytest.importorskip('boto3')
103110

111+
s3_authenticated_access_verification('commoncrawl')
112+
104113
res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
105114
offset=53235662,
106115
length=2526)
@@ -112,10 +121,11 @@ def test_s3_read_1():
112121
assert reader.readline() == b'WARC/1.0\r\n'
113122
assert reader.readline() == b'WARC-Type: response\r\n'
114123

115-
@pytest.mark.skip("skip for now, made need different s3 source")
116-
def test_s3_read_2():
124+
def test_s3_read_authenticated_2():
117125
pytest.importorskip('boto3')
118126

127+
s3_authenticated_access_verification('commoncrawl')
128+
119129
res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html')
120130

121131
buff = res.read()

0 commit comments

Comments
 (0)