Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for S3 download urls: presigned urls, S3 etag #34

Merged
merged 6 commits into from
Oct 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
1.1.0
------

* Add support for Webrecorder.io. [#34](https://github.com/unt-libraries/py-wasapi-client/pull/34)

1.0.0
------

* Initial release.
15 changes: 11 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# py-wasapi-client [![Build Status](https://travis-ci.org/unt-libraries/py-wasapi-client.svg)](https://travis-ci.org/unt-libraries/py-wasapi-client)
A client for the [Archive-It] WASAPI Data Transfer API. This client
is being developed according to the [ait-specification](https://github.com/WASAPI-Community/data-transfer-apis/tree/master/ait-specification).
A client for the WASAPI Data Transfer API. Initially developed according to the
[Archive-It specification](https://github.com/WASAPI-Community/data-transfer-apis/tree/master/ait-specification), the client now additionally supports [Webrecorder.io](https://webrecorder.io/).

## Requirements

Expand Down Expand Up @@ -96,8 +96,8 @@ query parameters:

## Configuration

When you are using the tool to query an Archive-It WASAPI endpoint,
you will need to supply a username and password for the API. You have
When you are using the tool to query an Archive-It or Webrecorder WASAPI
endpoint, you will need to supply a username and password for the API. You have
three options to provide these credentials.

1. Supply a username with `-u`, and you will be prompted for a password.
Expand Down Expand Up @@ -196,6 +196,13 @@ wasapi-client do the downloading, use the --urls flag.
$ wasapi-client --profile unt --crawl 256119 --urls
```

To use the client with Webrecorder (not all query parameters may be supported),
supply the base URL with -b.

```
$ wasapi-client -b https://webrecorder.io/api/v1/download/webdata --profile webrecorder --collection my_collection -d warcs
```

## Run the Tests

```
Expand Down
58 changes: 58 additions & 0 deletions tests/test_wasapi_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,23 @@ def test_download_check_exists_true(self):
# Check that no get request was made.
assert not mock_session.get.called

def test_download_uses_pre_signed_url(self):
    """A pre-signed S3 location is fetched via requests.get, bypassing
    the authenticated session."""
    signed_url = 'https://data.s3.amazonaws.com/warcs/blah.warc.gz?Signature=xyz'
    fallback_url = 'http://loc2/blah.warc.gz'
    target = 'blah.warc.gz'
    response = MockResponse200('')
    data_file = wc.DataFile([signed_url, fallback_url], target,
                            {'md5': '72b484a2610cb54ec22e48c8104ba3bd'},
                            123456)

    with patch('requests.get', return_value=response) as mock_get, \
            patch('wasapi_client.write_file') as mock_write:
        wc.download_file(data_file, requests.Session(), target)

    # Exactly one unauthenticated GET against the signed URL, then the write.
    mock_get.assert_called_once_with(signed_url, stream=True)
    mock_write.assert_called_once_with(response, target)


class Test_check_exists:
def test_check_exists_return_true(self):
Expand Down Expand Up @@ -467,6 +484,47 @@ def test_verify_file_one_supported_algorithm(self, mock_calc_sum):
mock_logger.debug.assert_called_once_with('abc is unsupported')
mock_logger.info.assert_called_once_with('Checksum success at: dummy/path')

@patch('wasapi_client.calculate_sum')
def test_verify_file_s3etag_algorithm_regular_md5(self, mock_checksum):
    """An s3etag value without a '-' is verified as a plain md5 digest."""
    plain_etag = '72b484a2610cb54ec22e48c8104ba3bd'
    mock_checksum.return_value = plain_etag
    assert wc.verify_file({'s3etag': plain_etag}, 'dummy/path')
    # Plain etags fall through to hashlib.md5 with the default read size.
    mock_checksum.assert_called_once_with(hashlib.md5, 'dummy/path', wc.READ_LIMIT)

@patch('wasapi_client.calculate_sum')
def test_verify_file_s3etag_algorithm_double_md5(self, mock_checksum):
    """An s3etag value containing a '-' is verified as a multipart
    (double-md5) digest using S3DoubleMD5 and the 8 MiB chunk size."""
    multipart_etag = 'ceb8853ddc5086cc4ab9e149f8f09c88-2'
    mock_checksum.return_value = multipart_etag
    assert wc.verify_file({'s3etag': multipart_etag}, 'dummy/path')
    mock_checksum.assert_called_once_with(wc.S3DoubleMD5, 'dummy/path', 1024*1024*8)


class Test_S3DoubleMD5:
    def test_S3DoubleMD5_single_md5(self):
        """With a single update, hexdigest equals a plain md5 hexdigest."""
        payload = b'We are updating this once.'
        hasher = wc.S3DoubleMD5()
        hasher.update(payload)
        # One update -> one stored md5, and no '-N' suffix on the digest.
        assert len(hasher.md5s) == 1
        assert hasher.hexdigest() == hashlib.md5(payload).hexdigest()

    def test_S3DoubleMD5_double_md5(self):
        """With several updates, hexdigest is the md5 of the concatenated
        per-chunk digests followed by '-<number of chunks>'."""
        payload = b'We are updating this once.\nTwice.\nAnd three times.'
        hasher = wc.S3DoubleMD5()
        # Feed three separate chunks to force the multipart code path.
        for piece in payload.split(b'\n'):
            hasher.update(piece)
        assert len(hasher.md5s) == 3
        assert hasher.hexdigest() == '8e73850eb35bebe8ebd2896dd9032e48-3'


class Test_calculate_sum:
@pytest.mark.skipif(sys.version_info < (3, 4, 4), reason=('bug via mock_open '
Expand Down
56 changes: 51 additions & 5 deletions wasapi_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import multiprocessing
import os
import requests
import re
import sys
from collections import defaultdict
try:
Expand All @@ -29,6 +30,8 @@ class JSONDecodeError(ValueError):

PROFILE_PATH = os.path.join(os.path.expanduser('~'), '.wasapi-client')

# Patterns identifying pre-signed (self-authorizing) download URLs, which
# must be fetched without the session's auth headers. Dots in the S3 host
# are escaped so that e.g. 'x.s3Xamazonaws.com' does not falsely match.
PRE_SIGNED_REGEX = [re.compile(r'https://.*\.s3\.amazonaws\.com/.*[?].*Signature=.+')]


def start_listener_logging(log_q, path=''):
formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s')
Expand Down Expand Up @@ -219,8 +222,17 @@ def download_file(data_file, session, output_path):
data_file.verified = True
return data_file
for location in data_file.locations:

# if location matches a 'pre-signed' url regex pattern,
# skip auth for this location
for rx in PRE_SIGNED_REGEX:
if rx.match(location):
sesh = requests
else:
sesh = session

try:
response = session.get(location, stream=True)
response = sesh.get(location, stream=True)
except requests.exceptions.RequestException as err:
# This could be a remote disconnect, read timeout, connection timeout,
# temporary name resolution issue...
Expand Down Expand Up @@ -270,12 +282,25 @@ def verify_file(checksums, file_path):
or failure determines if the file is valid.
"""
for algorithm, value in checksums.items():
read_limit = READ_LIMIT
hash_function = getattr(hashlib, algorithm, None)
if not hash_function and algorithm == 's3etag':
# if etag does not contain a '-', then its just a regular md5
if '-' not in value:
hash_function = hashlib.md5

# otherwise, its likely a 'double-md5'
# see: https://zihao.me/post/calculating-etag-for-aws-s3-objects/
else:
hash_function = S3DoubleMD5
# expected chunk size for S3 md5 computation
read_limit = 1024 * 1024 * 8

if not hash_function:
# The hash algorithm provided is not supported by hashlib.
LOGGER.debug('{} is unsupported'.format(algorithm))
continue
digest = calculate_sum(hash_function, file_path)
digest = calculate_sum(hash_function, file_path, read_limit)
if digest == value:
LOGGER.info('Checksum success at: {}'.format(file_path))
return True
Expand All @@ -289,14 +314,35 @@ def verify_file(checksums, file_path):
return False


def calculate_sum(hash_function, file_path):
class S3DoubleMD5:
    """Implements double-md5 computation as suggested by:

    https://zihao.me/post/calculating-etag-for-aws-s3-objects/

    Mimics the hashlib interface (update/hexdigest) closely enough to be
    used by calculate_sum. Each chunk passed to update() is hashed
    separately; the final digest is the md5 of the concatenated chunk
    digests plus a '-<chunk count>' suffix, matching S3's multipart ETag.
    """

    def __init__(self):
        # One md5 object per chunk fed to update(), in order.
        self.md5s = []

    def update(self, buff):
        self.md5s.append(hashlib.md5(buff))

    def hexdigest(self):
        chunk_count = len(self.md5s)
        # A single chunk behaves exactly like a plain md5 (no suffix).
        if chunk_count == 1:
            return self.md5s[0].hexdigest()

        combined = hashlib.md5(b''.join(m.digest() for m in self.md5s))
        return '{}-{}'.format(combined.hexdigest(), chunk_count)


def calculate_sum(hash_function, file_path, read_limit=READ_LIMIT):
    """Return the hex checksum of the file at file_path.

    hash_function is a hashlib constructor (or S3DoubleMD5); the file is
    read in read_limit-sized chunks so large files never load fully
    into memory.
    """
    hasher = hash_function()
    with open(file_path, 'rb') as stream:
        # read() returns b'' only at EOF, so the sentinel ends the loop.
        for chunk in iter(lambda: stream.read(read_limit), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


Expand Down