Skip to content

Commit

Permalink
Parse out subseconds from WARC-Date longer than six digits
Browse files Browse the repository at this point in the history
  • Loading branch information
machawk1 committed Jun 19, 2020
1 parent 8f65140 commit b76135a
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 4 deletions.
4 changes: 2 additions & 2 deletions ipwb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from six import PY2
from six import PY3

from .util import iso8601ToDigits14
from .util import iso8601ToDigits
from . import util as ipwbUtils

# from warcio.archiveiterator import ArchiveIterator
Expand Down Expand Up @@ -287,7 +287,7 @@ def getCDXJLinesFromFile(warcPath, **encCompOpts):
originaluri_surted = \
surt.surt(originaluri,
path_strip_trailing_slash_unless_empty=False)
timestamp = iso8601ToDigits14(
timestamp = iso8601ToDigits(
record.rec_headers.get_header('WARC-Date'))
mime = record.http_headers.get_header('content-type')
obj = {
Expand Down
24 changes: 22 additions & 2 deletions ipwb/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,26 @@ def __str__(self):
return f'WARC-Date {self.target_string} not parseable.'


def iso8601ToDigits14(warcDatetimeString):
def is_warc11_subsecond_edgecase(dt):
    """
    Convert a WARC-Date that carries a sub-second fraction to digits.

    WARC/1.1 allows 1-9 fractional-second digits per W3C DTF, which is
    more precision than strptime's %f directive (at most 6 digits) can
    parse. The whole-second part is therefore validated with strptime
    and the fraction digits are appended verbatim.

    Expected layout: 'YYYY-MM-DDTHH:MM:SS.<1-9 digits>Z'

    :param dt: WARC-Date header value as a string
    :return: 14-digit 'YYYYmmddHHMMSS' timestamp immediately followed by
        the sub-second digits exactly as they appeared in dt
    :raises InvalidWARCDateException: if dt does not match the layout
        above or its date/time fields are not a valid datetime
    """
    # Fixed-width layout: the seconds field ends at index 18, the
    # separator '.' sits at index 19, the fraction starts at index 20
    # and the final character is the 'Z' designator.
    subseconds = dt[20:-1]

    # 19 (date-time) + 1 ('.') + 9 (max fraction digits) + 1 ('Z') == 30
    well_formed = (
        len(dt) <= 30
        and dt[19:20] == '.'
        and dt[-1:] == 'Z'
        and subseconds.isdigit()
    )
    if not well_formed:
        raise InvalidWARCDateException(target_string=dt)

    try:
        # Only the strptime call can raise ValueError; keep the try minimal
        dts = datetime.datetime.strptime(dt[:19], '%Y-%m-%dT%H:%M:%S')
    except ValueError as ve:
        raise InvalidWARCDateException(target_string=dt) from ve

    return f"{dts.strftime('%Y%m%d%H%M%S')}{subseconds}"


def iso8601ToDigits(warcDatetimeString):
setLocale()

iso8601_datestrings = ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%MZ",
Expand All @@ -190,7 +209,8 @@ def iso8601ToDigits14(warcDatetimeString):
# TODO: Account for conversion if TZ other than GMT not specified

if d is None:
raise InvalidWARCDateException(target_string=warcDatetimeString)
# Check edge case of 1-9 sub-seconds
return is_warc11_subsecond_edgecase(warcDatetimeString)

return d.strftime('%Y%m%d%H%M%S')

Expand Down

5 comments on commit b76135a

@machawk1
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be improved but works to convert a 7-9 digit WARC-Date to digits. Moved away from saying "14 digits" since more precision will be had with the addition of milliseconds.

% ipwb index samples/warcs/variableSizedDates.warc
!context ["http://tools.ietf.org/html/rfc7089"]
!meta {"generator": "InterPlanetary Wayback v.0.2020.06.18.1933", "created_at": "2020-06-19T12:55:39.225312"}
us,memento)/ 20140101000000 {"locator": "urn:ipfs/QmNQX5gEjbEPModBHXb6w4EWveLkZ57uEC9Kzh8bho7QmL/QmX4gE6SdJK8v67XikqQFJrac4xaqB5kwsgona2nH9hZwm", "status_code": "200", "mime_type": "text/html", "original_uri": "http://memento.us/"}
us,memento)/ 20140210000001 {"locator": "urn:ipfs/QmNQX5gEjbEPModBHXb6w4EWveLkZ57uEC9Kzh8bho7QmL/QmXQB6e2aB7VRaA4CK5H33sTfVC6GxNd1JtSgCaWVuUbfj", "status_code": "200", "mime_type": "text/html", "original_uri": "http://memento.us/"}
us,memento)/ 20140210000001 {"locator": "urn:ipfs/QmNQX5gEjbEPModBHXb6w4EWveLkZ57uEC9Kzh8bho7QmL/QmYWRfaHFcN7ygLUiiKEF6ELApMbdhv7K3zRtrz5rog83U", "status_code": "200", "mime_type": "text/html", "original_uri": "http://memento.us/"}
us,memento)/ 20140210000001000000002 {"locator": "urn:ipfs/QmNQX5gEjbEPModBHXb6w4EWveLkZ57uEC9Kzh8bho7QmL/Qmb8q1BFPws4ZNhL9MczY9tb4mWEPdV41LNuXD6oMkvzcw", "status_code": "200", "mime_type": "text/html", "original_uri": "http://memento.us/"}

Feedback, @ibnesayeed? Please note the date string in the last CDXJ line.

@ibnesayeed
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can do something simpler here, but I need to think about potential implications. As an aside, there is another opportunity to use f-string in this commit.

@machawk1
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was unsure whether the string execution in an f-string would behave correctly, so opted for format after initially using an f-string.

Alternative approaches might be to initially rely on the string length and check validity from there. Feel free to propose something else but this is working for now. I'll continue looking into replay handling these long datestrings.

@ibnesayeed
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking more in the direction of using a RegEx with certain defaults to isolate pieces and then format the time from there.

@machawk1
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about that approach, too. I don't think there is any issue with doing it that way but some of the logic will need to be re-written.

Please # to comment.