Skip to content

Commit e076b05

Browse files
Merge pull request #1 from matthewhanson/develop
publish 0.0.3
2 parents a6f10da + abee142 commit e076b05

File tree

6 files changed

+220
-8
lines changed

6 files changed

+220
-8
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,16 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
66

77
## [Unreleased]
88

9+
## [v0.0.3] - 2019-11-22
10+
11+
### Added
12+
- s3.latest_inventory iterator function added for looping through matching files in an s3 inventory
13+
- s3.get_presigned_url for generating a presigned URL for S3 (does not use boto3)
14+
915
## [v0.0.2] - 2019-10-08
1016

1117
Initial Release
1218

1319
[Unreleased]: https://github.com/matthewhanson/boto3-utils/compare/master...develop
20+
[v0.0.3]: https://github.com/matthewhanson/boto3-utils/compare/0.0.2...0.0.3
1421
[v0.0.2]: https://github.com/matthewhanson/boto3-utils/tree/0.0.2

boto3utils/s3.py

Lines changed: 134 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
import boto3
22
import json
3+
import hashlib
4+
import hmac
35
import logging
6+
import os
47

58
import os.path as op
69

710
from botocore.exceptions import ClientError
811
from datetime import datetime, timedelta
9-
from dateutil.parser import parse
1012
from gzip import GzipFile
1113
from io import BytesIO
1214
from os import makedirs, getenv
@@ -85,18 +87,22 @@ def download(uri, path=''):
8587
return fout
8688

8789

88-
def read_json(url):
89-
"""
90-
Download object from S3 as JSON
91-
"""
90+
def read(url):
    """ Read an object from s3 and return its contents as UTF-8 decoded text

    Objects whose key has a '.gz' extension are gunzipped before decoding.
    """
    location = urlparse(url)
    bucket, obj_key = location['bucket'], location['key']
    raw = s3.get_object(Bucket=bucket, Key=obj_key)['Body'].read()
    _, extension = op.splitext(obj_key)
    if extension == '.gz':
        # payload is gzip-compressed: inflate it in memory
        with GzipFile(None, 'rb', fileobj=BytesIO(raw)) as gz:
            raw = gz.read()
    return raw.decode('utf-8')
9898

9999

100+
def read_json(url):
    """ Read an object from S3 and parse its text contents as JSON """
    text = read(url)
    return json.loads(text)
103+
104+
105+
# function derived from https://alexwlchan.net/2018/01/listing-s3-keys-redux/
100106
def find(url, suffix=''):
101107
"""
102108
Generate objects in an S3 bucket.
@@ -132,3 +138,125 @@ def find(url, suffix=''):
132138
kwargs['ContinuationToken'] = resp['NextContinuationToken']
133139
except KeyError:
134140
break
141+
142+
143+
def latest_inventory(url, prefix=None, suffix=None, start_date=None, end_date=None, datetime_key='LastModifiedDate'):
    """ Generate records from the most recent inventory found under url

    Looks for a single manifest.json under today's, then yesterday's, dated
    folder. Each line of every file listed in the manifest is split on commas
    and mapped onto the manifest's fileSchema column names. Records are yielded
    as dicts when they pass the optional filters:

    prefix:      'Key' must start with this string
    suffix:      'Key' must end with this string (all files if suffix=None)
    start_date:  date in the datetime_key column must not be earlier than this
    end_date:    date in the datetime_key column must not be later than this

    When the schema has both 'Bucket' and 'Key' columns a 'url' entry
    (s3://bucket/key) is added to the record. Nothing is yielded if no
    manifest is found.
    """
    parts = urlparse(url)

    # locate the latest manifest file: try today's folder first, then yesterday's
    now = datetime.now()
    manifest_key = None
    for day in (now, now - timedelta(1)):
        day_prefix = op.join(parts['key'], day.strftime('%Y-%m-%d'))
        day_url = 's3://%s/%s' % (parts['bucket'], day_prefix)
        found = list(find(day_url, suffix='manifest.json'))
        if len(found) == 1:
            manifest_key = found[0]
            break

    if not manifest_key:
        return

    manifest = read_json('s3://%s/%s' % (parts['bucket'], manifest_key))
    # column names, in file order, for each inventory record
    columns = [str(name).strip() for name in manifest['fileSchema'].split(',')]
    has_location = 'Bucket' in columns and 'Key' in columns

    logger.info('Getting latest inventory from %s' % url)
    scanned = 0
    for entry in manifest.get('files', []):
        listing = read('s3://%s/%s' % (parts['bucket'], entry['key']))
        for line in listing.split('\n'):
            scanned += 1
            if scanned % 100000 == 0:
                logger.debug('%s: Scanned %s records' % (datetime.now(), str(scanned)))
            values = line.replace('"', '').split(',')
            info = {columns[i]: value for i, value in enumerate(values)}
            if 'Key' not in info:
                # blank or truncated line
                continue
            # drop records whose date falls outside [start_date, end_date]
            modified = datetime.strptime(info[datetime_key], "%Y-%m-%dT%H:%M:%S.%fZ").date()
            if start_date is not None and modified < start_date:
                continue
            if end_date is not None and modified > end_date:
                continue
            # drop records whose key does not begin with the requested prefix
            if prefix is not None and info['Key'][:len(prefix)] != prefix:
                continue
            # drop records whose key does not end with the requested suffix
            if suffix is not None and not info['Key'].endswith(suffix):
                continue
            if has_location:
                info['url'] = 's3://%s/%s' % (info['Bucket'], info['Key'])
            yield info
187+
188+
189+
def get_presigned_url(url, aws_region=None,
                      rtype='GET', public=False, requester_pays=False, content_type=None):
    """ Get an HTTPS URL and AWS Signature Version 4 headers for an s3 object (does not use boto3)

    Credentials are read from the environment: AWS_BUCKET_ACCESS_KEY_ID /
    AWS_BUCKET_SECRET_ACCESS_KEY take precedence over AWS_ACCESS_KEY_ID /
    AWS_SECRET_ACCESS_KEY. The region comes from aws_region if given, otherwise
    AWS_BUCKET_REGION, then AWS_REGION, then 'eu-central-1'.

    url:             s3 url of the object
    aws_region:      region used in the credential scope (overrides environment)
    rtype:           HTTP method the request will be signed for
    public:          if True, sign an 'x-amz-acl: public-read' header
    requester_pays:  if True, sign an 'x-amz-request-payer: requester' header
    content_type:    if given, added as an (unsigned) 'content-type' header

    Returns a tuple (request_url, headers). If no credentials are available
    the original url is returned unchanged with headers set to None.
    """
    # imported locally so this block does not depend on a new module-level import
    from urllib.parse import quote

    access_key = os.environ.get('AWS_BUCKET_ACCESS_KEY_ID', os.environ.get('AWS_ACCESS_KEY_ID'))
    secret_key = os.environ.get('AWS_BUCKET_SECRET_ACCESS_KEY', os.environ.get('AWS_SECRET_ACCESS_KEY'))
    region = os.environ.get('AWS_BUCKET_REGION', os.environ.get('AWS_REGION', 'eu-central-1'))
    if aws_region is not None:
        region = aws_region
    if access_key is None or secret_key is None:
        # if credentials not provided, just try to download without signed URL
        logger.debug('Not using signed URL for %s' % url)
        return url, None

    parts = urlparse(url)
    bucket = parts['bucket']
    key = parts['key']

    service = 's3'
    host = '%s.%s.amazonaws.com' % (bucket, service)
    request_parameters = ''

    # Key derivation functions. See:
    # http://docs.aws.amazon.com/general/latest/gr/signature-v4-examples.html#signature-v4-examples-python
    def sign(signing_key, msg):
        return hmac.new(signing_key, msg.encode('utf-8'), hashlib.sha256).digest()

    def get_signature_key(secret, date_stamp, region_name, service_name):
        k_date = sign(('AWS4' + secret).encode('utf-8'), date_stamp)
        k_region = sign(k_date, region_name)
        k_service = sign(k_region, service_name)
        return sign(k_service, 'aws4_request')

    # Create a date for headers and the credential string
    t = datetime.utcnow()
    amzdate = t.strftime('%Y%m%dT%H%M%SZ')
    datestamp = t.strftime('%Y%m%d')  # Date w/o time, used in credential scope

    # create signed request and headers
    # SigV4 signs the URI-encoded path ('/' is kept as the separator); signing
    # the raw key fails for keys containing spaces or other reserved characters
    canonical_uri = '/' + quote(key, safe='/~')
    canonical_querystring = request_parameters

    payload_hash = 'UNSIGNED-PAYLOAD'
    headers = {
        'host': host,
        'x-amz-content-sha256': payload_hash,
        'x-amz-date': amzdate
    }

    if requester_pays:
        headers['x-amz-request-payer'] = 'requester'
    if public:
        headers['x-amz-acl'] = 'public-read'
    # the session token is only signed when the default (non bucket-specific) credentials are in use
    if os.environ.get('AWS_SESSION_TOKEN') and 'AWS_BUCKET_ACCESS_KEY_ID' not in os.environ:
        headers['x-amz-security-token'] = os.environ.get('AWS_SESSION_TOKEN')
    header_names = sorted(headers)
    canonical_headers = '\n'.join('%s:%s' % (name, headers[name]) for name in header_names) + '\n'
    signed_headers = ';'.join(header_names)

    canonical_request = '%s\n%s\n%s\n%s\n%s\n%s' % (
        rtype, canonical_uri, canonical_querystring, canonical_headers, signed_headers, payload_hash
    )
    algorithm = 'AWS4-HMAC-SHA256'
    credential_scope = '/'.join([datestamp, region, service, 'aws4_request'])
    hashed_request = hashlib.sha256(canonical_request.encode('utf-8')).hexdigest()
    string_to_sign = '\n'.join([algorithm, amzdate, credential_scope, hashed_request])
    signing_key = get_signature_key(secret_key, datestamp, region, service)
    signature = hmac.new(signing_key, string_to_sign.encode('utf-8'), hashlib.sha256).hexdigest()
    authorization_header = '%s Credential=%s/%s, SignedHeaders=%s, Signature=%s' % (
        algorithm, access_key, credential_scope, signed_headers, signature
    )

    # the request URL uses the same encoded path that was signed
    request_url = 'https://%s%s' % (host, canonical_uri)
    headers['Authorization'] = authorization_header
    # added after signing, so content-type is not part of the signed headers
    if content_type is not None:
        headers['content-type'] = content_type
    return request_url, headers

boto3utils/stepfunctions.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import boto3
2+
import json
3+
import logging
4+
5+
from botocore.client import Config
6+
from botocore.vendored.requests.exceptions import ReadTimeout
7+
from traceback import format_exc
8+
9+
logger = logging.getLogger(__name__)
10+
11+
12+
# module level client
13+
sfn = boto3.client('stepfunctions', config=Config(read_timeout=70))
14+
15+
16+
def run_activity(process, arn, **kwargs):
    """ Run an activity around the process function provided

    Loops forever: polls the Step Functions activity for a task, calls
    process with the task input and reports the outcome back.

    process:  callable taking the JSON-decoded task input as its single
              positional argument; its return value is JSON-encoded and sent
              as the task output
    arn:      ARN of the activity to poll
    kwargs:   accepted but not used by this function

    Any exception raised while processing (or while reporting success) is
    logged and reported as a task failure; the loop then continues.
    """
    while True:
        logger.info('Querying for task')
        try:
            task = sfn.get_activity_task(activityArn=arn)
        except ReadTimeout:
            logger.warning('Activity read timed out')
            continue
        token = task.get('taskToken', None)
        # a missing or empty token means no task was available during this poll
        if not token:
            continue
        logger.debug('taskToken: %s' % token)
        try:
            payload = task.get('input', '{}')
            logger.info('Payload: %s' % payload)
            # run process function with the decoded payload as its only argument
            output = process(json.loads(payload))
            # Send task success
            sfn.send_task_success(taskToken=token, output=json.dumps(output))
        except Exception as e:
            err = str(e)
            tb = format_exc()
            logger.error("Exception when running task: %s - %s" % (err, json.dumps(tb)))
            # keep the error within 256 characters: first 252 characters plus ' ...'
            err = (err[:252] + ' ...') if len(err) > 252 else err
            sfn.send_task_failure(taskToken=token, error=str(err), cause=tb)

boto3utils/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.0.2'
1+
__version__ = '0.0.3'

test/test_s3.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import pytest
44

5+
from datetime import datetime
56
# this must be imported before any boto3 module
67
from moto import mock_s3
78

@@ -77,4 +78,19 @@ def test_read_json(s3mock):
7778
def test_find(s3mock):
    """ find should list at least one object under the test prefix, including test.json """
    found = list(s3.find('s3://%s/test' % BUCKET))
    assert len(found) > 0
    assert 'test.json' in found
82+
83+
def test_latest_inventory():
    """ Check the first record of the latest inventory, with and without a suffix filter """
    url = 's3://sentinel-inventory/sentinel-s1-l1c/sentinel-s1-l1c-inventory'
    suffix = 'productInfo.json'
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"

    # NOTE(review): timedelta.seconds is only the sub-day remainder (0-86399),
    # so the `hours < 24` checks below can never fail - confirm intended assertion

    # only the first record is inspected; nothing is asserted if none is returned
    first = next(iter(s3.latest_inventory(url, suffix=suffix)), None)
    if first is not None:
        modified = datetime.strptime(first['LastModifiedDate'], timestamp_format)
        hours = (datetime.today() - modified).seconds // 3600
        assert hours < 24
        assert first['url'].endswith(suffix)

    first = next(iter(s3.latest_inventory(url)), None)
    if first is not None:
        modified = datetime.strptime(first['LastModifiedDate'], timestamp_format)
        hours = (datetime.today() - modified).seconds // 3600
        assert hours < 24

test/test_stepfunctions.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#import boto3
2+
#import os
3+
#import pytest
4+
5+
# this must be imported before any boto3 module
6+
#from moto import mock_sfn
7+
8+
#from boto3utils import stepfunctions as sfn
9+
#from shutil import rmtree
10+
11+
12+
#@pytest.fixture(scope='function')
13+
#def sfn_mock():
14+
# with mock_sfn():
15+
# client = boto3.client('stepfunctions')
16+
# yield client
17+
18+
19+
#def test_run_activity(sfn):
20+
# sfn.run_activity(sfn, arn)

0 commit comments

Comments
 (0)