Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for S3 download urls: presigned urls, S3 etag #34

Merged
merged 6 commits into from
Oct 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
1.1.0
------

* Add support for Webrecorder.io. [#34](https://github.com/unt-libraries/py-wasapi-client/pull/34)

1.0.0
------

* Initial release.
15 changes: 11 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# py-wasapi-client [![Build Status](https://travis-ci.org/unt-libraries/py-wasapi-client.svg)](https://travis-ci.org/unt-libraries/py-wasapi-client)
A client for the [Archive-It] WASAPI Data Transfer API. This client
is being developed according to the [ait-specification](https://github.com/WASAPI-Community/data-transfer-apis/tree/master/ait-specification).
A client for the WASAPI Data Transfer API. Initially developed according to the
[Archive-It specification](https://github.com/WASAPI-Community/data-transfer-apis/tree/master/ait-specification), the client now additionally supports [Webrecorder.io](https://webrecorder.io/).

## Requirements

Expand Down Expand Up @@ -96,8 +96,8 @@ query parameters:

## Configuration

When you are using the tool to query an Archive-It WASAPI endpoint,
you will need to supply a username and password for the API. You have
When you are using the tool to query an Archive-It or Webrecorder WASAPI
endpoint, you will need to supply a username and password for the API. You have
three options to provide these credentials.

1. Supply a username with `-u`, and you will be prompted for a password.
Expand Down Expand Up @@ -196,6 +196,13 @@ wasapi-client do the downloading, use the --urls flag.
$ wasapi-client --profile unt --crawl 256119 --urls
```

To use the client with Webrecorder (not all query parameters may be supported),
supply the base URL with -b.

```
$ wasapi-client -b https://webrecorder.io/api/v1/download/webdata --profile webrecorder --collection my_collection -d warcs
```

## Run the Tests

```
Expand Down
58 changes: 58 additions & 0 deletions tests/test_wasapi_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,23 @@ def test_download_check_exists_true(self):
# Check that no get request was made.
assert not mock_session.get.called

def test_download_uses_pre_signed_url(self):
    """A pre-signed S3 location is fetched via requests.get, bypassing
    the authenticated session."""
    signed_url = 'https://data.s3.amazonaws.com/warcs/blah.warc.gz?Signature=xyz'
    fallback_url = 'http://loc2/blah.warc.gz'
    target = 'blah.warc.gz'
    response = MockResponse200('')
    data_file = wc.DataFile([signed_url, fallback_url], target,
                            {'md5': '72b484a2610cb54ec22e48c8104ba3bd'},
                            123456)

    with patch('requests.get', return_value=response) as mock_get, \
            patch('wasapi_client.write_file') as mock_write:
        wc.download_file(data_file, requests.Session(), target)

    # Exactly one unauthenticated GET against the signed URL, then the write.
    mock_get.assert_called_once_with(signed_url, stream=True)
    mock_write.assert_called_once_with(response, target)


class Test_check_exists:
def test_check_exists_return_true(self):
Expand Down Expand Up @@ -467,6 +484,47 @@ def test_verify_file_one_supported_algorithm(self, mock_calc_sum):
mock_logger.debug.assert_called_once_with('abc is unsupported')
mock_logger.info.assert_called_once_with('Checksum success at: dummy/path')

@patch('wasapi_client.calculate_sum')
def test_verify_file_s3etag_algorithm_regular_md5(self, mock_checksum):
    """An s3etag value without a '-' is verified as a plain md5 digest."""
    plain_etag = '72b484a2610cb54ec22e48c8104ba3bd'
    mock_checksum.return_value = plain_etag
    assert wc.verify_file({'s3etag': plain_etag}, 'dummy/path')
    # Plain etags fall through to hashlib.md5 with the default read size.
    mock_checksum.assert_called_once_with(hashlib.md5, 'dummy/path', wc.READ_LIMIT)

@patch('wasapi_client.calculate_sum')
def test_verify_file_s3etag_algorithm_double_md5(self, mock_checksum):
    """An s3etag value containing a '-' is verified as a multipart
    (double-md5) digest using S3DoubleMD5 and the 8 MiB chunk size."""
    multipart_etag = 'ceb8853ddc5086cc4ab9e149f8f09c88-2'
    mock_checksum.return_value = multipart_etag
    assert wc.verify_file({'s3etag': multipart_etag}, 'dummy/path')
    mock_checksum.assert_called_once_with(wc.S3DoubleMD5, 'dummy/path', 1024*1024*8)


class Test_S3DoubleMD5:
    def test_S3DoubleMD5_single_md5(self):
        """With a single update, hexdigest equals a plain md5 hexdigest."""
        payload = b'We are updating this once.'
        hasher = wc.S3DoubleMD5()
        hasher.update(payload)
        # One update -> one stored md5, and no '-N' suffix on the digest.
        assert len(hasher.md5s) == 1
        assert hasher.hexdigest() == hashlib.md5(payload).hexdigest()

    def test_S3DoubleMD5_double_md5(self):
        """With several updates, hexdigest is the md5 of the concatenated
        per-chunk digests followed by '-<number of chunks>'."""
        payload = b'We are updating this once.\nTwice.\nAnd three times.'
        hasher = wc.S3DoubleMD5()
        # Feed three separate chunks to force the multipart code path.
        for piece in payload.split(b'\n'):
            hasher.update(piece)
        assert len(hasher.md5s) == 3
        assert hasher.hexdigest() == '8e73850eb35bebe8ebd2896dd9032e48-3'


class Test_calculate_sum:
@pytest.mark.skipif(sys.version_info < (3, 4, 4), reason=('bug via mock_open '
Expand Down
56 changes: 51 additions & 5 deletions wasapi_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import multiprocessing
import os
import requests
import re
import sys
from collections import defaultdict
try:
Expand All @@ -29,6 +30,8 @@ class JSONDecodeError(ValueError):

PROFILE_PATH = os.path.join(os.path.expanduser('~'), '.wasapi-client')

# Patterns identifying pre-signed (self-authorizing) download URLs, which
# must be fetched without the session's auth headers. Dots in the S3 host
# are escaped so that e.g. 'x.s3Xamazonaws.com' does not falsely match.
PRE_SIGNED_REGEX = [re.compile(r'https://.*\.s3\.amazonaws\.com/.*[?].*Signature=.+')]


def start_listener_logging(log_q, path=''):
formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s')
Expand Down Expand Up @@ -219,8 +222,17 @@ def download_file(data_file, session, output_path):
data_file.verified = True
return data_file
for location in data_file.locations:

# if location matches a 'pre-signed' url regex pattern,
# skip auth for this location
for rx in PRE_SIGNED_REGEX:
if rx.match(location):
sesh = requests
else:
sesh = session

try:
response = session.get(location, stream=True)
response = sesh.get(location, stream=True)
except requests.exceptions.RequestException as err:
# This could be a remote disconnect, read timeout, connection timeout,
# temporary name resolution issue...
Expand Down Expand Up @@ -270,12 +282,25 @@ def verify_file(checksums, file_path):
or failure determines if the file is valid.
"""
for algorithm, value in checksums.items():
read_limit = READ_LIMIT
hash_function = getattr(hashlib, algorithm, None)
if not hash_function and algorithm == 's3etag':
# if etag does not contain a '-', then its just a regular md5
if '-' not in value:
hash_function = hashlib.md5

# otherwise, its likely a 'double-md5'
# see: https://zihao.me/post/calculating-etag-for-aws-s3-objects/
else:
hash_function = S3DoubleMD5
# expected chunk size for S3 md5 computation
read_limit = 1024 * 1024 * 8

if not hash_function:
# The hash algorithm provided is not supported by hashlib.
LOGGER.debug('{} is unsupported'.format(algorithm))
continue
digest = calculate_sum(hash_function, file_path)
digest = calculate_sum(hash_function, file_path, read_limit)
if digest == value:
LOGGER.info('Checksum success at: {}'.format(file_path))
return True
Expand All @@ -289,14 +314,35 @@ def verify_file(checksums, file_path):
return False


def calculate_sum(hash_function, file_path):
class S3DoubleMD5:
    """Implements double-md5 computation as suggested by:

    https://zihao.me/post/calculating-etag-for-aws-s3-objects/

    Mimics the hashlib interface (update/hexdigest) closely enough to be
    used by calculate_sum. Each chunk passed to update() is hashed
    separately; the final digest is the md5 of the concatenated chunk
    digests plus a '-<chunk count>' suffix, matching S3's multipart ETag.
    """

    def __init__(self):
        # One md5 object per chunk fed to update(), in order.
        self.md5s = []

    def update(self, buff):
        self.md5s.append(hashlib.md5(buff))

    def hexdigest(self):
        chunk_count = len(self.md5s)
        # A single chunk behaves exactly like a plain md5 (no suffix).
        if chunk_count == 1:
            return self.md5s[0].hexdigest()

        combined = hashlib.md5(b''.join(m.digest() for m in self.md5s))
        return '{}-{}'.format(combined.hexdigest(), chunk_count)


def calculate_sum(hash_function, file_path, read_limit=READ_LIMIT):
    """Return the hex checksum of the file at file_path.

    hash_function is a hashlib constructor (or S3DoubleMD5); the file is
    read in read_limit-sized chunks so large files never load fully
    into memory.
    """
    hasher = hash_function()
    with open(file_path, 'rb') as stream:
        # read() returns b'' only at EOF, so the sentinel ends the loop.
        for chunk in iter(lambda: stream.read(read_limit), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


Expand Down