Skip to content

Commit

Permalink
change checksum of gcp download. add retry.
Browse files Browse the repository at this point in the history
context: codecov/engineering-team#1029

As suggested in the ticket we want to explore if switching the checksum
validation to 'crc32c' (from 'md5') reduces the occurance of
`DataCorruption` errors.

I switched the function to `download_as_bytes` (see [docs](https://cloud.google.com/python/docs/reference/storage/latest/google.cloud.storage.blob.Blob#google_cloud_storage_blob_Blob_download_as_bytes)) because the existing `download_as_string` is
deprecated and doesn't provide the `cheksum` argument.

I also added a retry mechanism to retry the download once in case
of `DataCorruption`. This is based on vibes and feelings mostly,
but my line of thought is that this _might_ be a temporary issue
around some part of the system downloading a file and another part
updating it kinda concurrently. Also there are legitimate resons
that packets are lost through the network and everything.
If it is a temporary issue a secondary attempt is fairly inexpensive,
and should yield good results.

Plus we can see if this theory works by tracking the number of
issues with the `DataCorruption` error over time compared to the
number of logs for "we are retrying this".

If the number of logs is low we know the change to 'crc32c' was efficient.
If the number of logs is high but the number of exceptions is low
we know that retrying is effective.
  • Loading branch information
giovanni-guidini committed Jan 31, 2024
1 parent ab54697 commit b7e90f1
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 3 deletions.
11 changes: 8 additions & 3 deletions shared/storage/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def append_to_file(self, bucket_name, path, data):
file_contents = data
return self.write_file(bucket_name, path, file_contents)

def read_file(self, bucket_name, path, file_obj=None):
def read_file(self, bucket_name, path, file_obj=None, *, retry=0):
"""Reads the content of a file
Args:
Expand All @@ -134,11 +134,16 @@ def read_file(self, bucket_name, path, file_obj=None):
blob.content_encoding = "gzip"
blob.patch()
if file_obj is None:
return blob.download_as_string()
return blob.download_as_bytes(checksum="crc32c")
else:
blob.download_to_file(file_obj)
blob.download_to_file(file_obj, checksum="crc32c")
except google.cloud.exceptions.NotFound:
raise FileNotInStorageError(f"File {path} does not exist in {bucket_name}")
except google.resumable_media.common.DataCorruption:
if retry == 0:
log.info("Download checksum failed. Trying again")
return self.read_file(bucket_name, path, file_obj, retry=1)
raise

def delete_file(self, bucket_name, path):
"""Deletes a single file from the storage (what happens if the file doesnt exist?)
Expand Down
38 changes: 38 additions & 0 deletions tests/unit/storage/test_gcp.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import gzip
import io
import tempfile
from unittest.mock import MagicMock, patch

import pytest
from google.cloud import storage as google_storage
from google.resumable_media.common import DataCorruption

from shared.storage.exceptions import BucketAlreadyExistsError, FileNotInStorageError
from shared.storage.gcp import GCPStorageService
Expand Down Expand Up @@ -235,3 +237,39 @@ def test_list_folder_contents(self, request, codecov_vcr):
assert sorted(expected_result_2, key=lambda x: x["size"]) == sorted(
results_2, key=lambda x: x["size"]
)

@patch("shared.storage.gcp.storage")
def test_read_file_retry_success(self, mock_storage):

storage = GCPStorageService(gcp_config)
mock_storage.Client.assert_called()
mock_blob = MagicMock(
name="fake_blob",
download_as_bytes=MagicMock(
side_effect=[
DataCorruption(response="checksum match failed"),
b"contents",
]
),
)
mock_storage.Blob.return_value = mock_blob
response = storage.read_file("root_bucket", "path/to/blob", None)
assert response == b"contents"

@patch("shared.storage.gcp.storage")
def test_read_file_retry_fail_twice(self, mock_storage):

storage = GCPStorageService(gcp_config)
mock_storage.Client.assert_called()
mock_blob = MagicMock(
name="fake_blob",
download_as_bytes=MagicMock(
side_effect=[
DataCorruption(response="checksum match failed"),
DataCorruption(response="checksum match failed"),
]
),
)
mock_storage.Blob.return_value = mock_blob
with pytest.raises(DataCorruption):
response = storage.read_file("root_bucket", "path/to/blob", None)

0 comments on commit b7e90f1

Please # to comment.