From b76135a73a17afd1bb6f1ebd9206ce7265ad90ff Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Fri, 19 Jun 2020 12:53:37 -0400 Subject: [PATCH] Parse out subseconds from WARC-Date longer than six digits --- ipwb/indexer.py | 4 ++-- ipwb/util.py | 24 ++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/ipwb/indexer.py b/ipwb/indexer.py index ca9d8b41..65efbdc5 100755 --- a/ipwb/indexer.py +++ b/ipwb/indexer.py @@ -34,7 +34,7 @@ from six import PY2 from six import PY3 -from .util import iso8601ToDigits14 +from .util import iso8601ToDigits from . import util as ipwbUtils # from warcio.archiveiterator import ArchiveIterator @@ -287,7 +287,7 @@ def getCDXJLinesFromFile(warcPath, **encCompOpts): originaluri_surted = \ surt.surt(originaluri, path_strip_trailing_slash_unless_empty=False) - timestamp = iso8601ToDigits14( + timestamp = iso8601ToDigits( record.rec_headers.get_header('WARC-Date')) mime = record.http_headers.get_header('content-type') obj = { diff --git a/ipwb/util.py b/ipwb/util.py index 63d17569..70911b42 100644 --- a/ipwb/util.py +++ b/ipwb/util.py @@ -168,7 +168,26 @@ def __str__(self): return f'WARC-Date {self.target_string} not parseable.' -def iso8601ToDigits14(warcDatetimeString): +def is_warc11_subsecond_edgecase(dt): + """ + Check if sub-second value is included in WARC-Date to comply with + WARC/1.1 specification allowing 1-9 digits per W3C DTF + """ + dt_f = f'{dt[:26]}{dt[-1:]}' + try: + dts = datetime.datetime.strptime(dt_f, '%Y-%m-%dT%H:%M:%S.%fZ') + within_subsecond_len_threshold = len(dt) <= 30 + all_subsec_chars_are_digits = dt[20:-1].isdigit() + if within_subsecond_len_threshold and all_subsec_chars_are_digits: + return '{}{}'.format( + dts.strftime('%Y%m%d%H%M%S'), dt[20:-1]) + else: + raise InvalidWARCDateException(target_string=dt) + except ValueError as ve: + raise InvalidWARCDateException(target_string=dt) + + +def iso8601ToDigits(warcDatetimeString): setLocale() iso8601_datestrings = ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%MZ", @@ -190,7 +209,8 @@ def iso8601ToDigits14(warcDatetimeString): # TODO: Account for conversion if TZ other than GMT not specified if d is None: - raise InvalidWARCDateException(target_string=warcDatetimeString) + # Check edge case of 1-9 sub-seconds + return is_warc11_subsecond_edgecase(warcDatetimeString) return d.strftime('%Y%m%d%H%M%S')