Skip to content

Commit a8109e0

Browse files
authored
fix: send entire object checksum at "initiate a resumable upload session" (#1525)
* [Initiate a resumable upload session](https://cloud.google.com/storage/docs/performing-resumable-uploads#initiate-session) provides a way to specify entire object checksum as metadata. * This will be used by the server before finalization, if there's a mismatch between the checksum provided by client (ie this app) and the one calculated by server. The server will return `400: BadRequest`
1 parent 7ce4e0b commit a8109e0

File tree

7 files changed

+150
-18
lines changed

7 files changed

+150
-18
lines changed

google/cloud/storage/_media/_upload.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,12 @@ class ResumableUpload(UploadBase):
411411
"""
412412

413413
def __init__(
414-
self, upload_url, chunk_size, checksum="auto", headers=None, retry=DEFAULT_RETRY
414+
self,
415+
upload_url,
416+
chunk_size,
417+
checksum="auto",
418+
headers=None,
419+
retry=DEFAULT_RETRY,
415420
):
416421
super(ResumableUpload, self).__init__(upload_url, headers=headers, retry=retry)
417422
if chunk_size % UPLOAD_CHUNK_SIZE != 0:
@@ -472,7 +477,12 @@ def total_bytes(self):
472477
return self._total_bytes
473478

474479
def _prepare_initiate_request(
475-
self, stream, metadata, content_type, total_bytes=None, stream_final=True
480+
self,
481+
stream,
482+
metadata,
483+
content_type,
484+
total_bytes=None,
485+
stream_final=True,
476486
):
477487
"""Prepare the contents of HTTP request to initiate upload.
478488
@@ -955,7 +965,12 @@ class XMLMPUContainer(UploadBase):
955965
"""
956966

957967
def __init__(
958-
self, upload_url, filename, headers=None, upload_id=None, retry=DEFAULT_RETRY
968+
self,
969+
upload_url,
970+
filename,
971+
headers=None,
972+
upload_id=None,
973+
retry=DEFAULT_RETRY,
959974
):
960975
super().__init__(upload_url, headers=headers, retry=retry)
961976
self._filename = filename

google/cloud/storage/blob.py

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414

1515
# pylint: disable=too-many-lines
1616

17-
"""Create / interact with Google Cloud Storage blobs.
18-
"""
17+
"""Create / interact with Google Cloud Storage blobs."""
1918

2019
import base64
2120
import copy
@@ -142,8 +141,8 @@
142141
r"(?P<scheme>gs)://(?P<bucket_name>[a-z0-9_.-]+)/(?P<object_name>.+)"
143142
)
144143

145-
_DEFAULT_CHUNKSIZE = 104857600 # 1024 * 1024 B * 100 = 100 MB
146-
_MAX_MULTIPART_SIZE = 8388608 # 8 MB
144+
_DEFAULT_CHUNKSIZE = 104857600 # 1024 * 1024 B * 100 = 100 MiB
145+
_MAX_MULTIPART_SIZE = 8388608 # 8 MiB
147146

148147
_logger = logging.getLogger(__name__)
149148

@@ -181,6 +180,14 @@ class Blob(_PropertyMixin):
181180
:type generation: long
182181
:param generation:
183182
(Optional) If present, selects a specific revision of this object.
183+
184+
:type crc32c_checksum: str
185+
:param crc32c_checksum:
186+
(Optional) If set, the CRC32C checksum of the blob's content.
187+
CRC32c checksum, as described in RFC 4960, Appendix B; encoded using
188+
base64 in big-endian byte order. See
189+
Apenndix B: https://datatracker.ietf.org/doc/html/rfc4960#appendix-B
190+
base64: https://datatracker.ietf.org/doc/html/rfc4648#section-4
184191
"""
185192

186193
_chunk_size = None # Default value for each instance.
@@ -214,6 +221,7 @@ def __init__(
214221
encryption_key=None,
215222
kms_key_name=None,
216223
generation=None,
224+
crc32c_checksum=None,
217225
):
218226
"""
219227
property :attr:`name`
@@ -237,6 +245,9 @@ def __init__(
237245
if generation is not None:
238246
self._properties["generation"] = generation
239247

248+
if crc32c_checksum is not None:
249+
self._properties["crc32c"] = crc32c_checksum
250+
240251
@property
241252
def bucket(self):
242253
"""Bucket which contains the object.
@@ -1643,7 +1654,9 @@ def download_as_string(
16431654
:raises: :class:`google.cloud.exceptions.NotFound`
16441655
"""
16451656
warnings.warn(
1646-
_DOWNLOAD_AS_STRING_DEPRECATED, PendingDeprecationWarning, stacklevel=2
1657+
_DOWNLOAD_AS_STRING_DEPRECATED,
1658+
PendingDeprecationWarning,
1659+
stacklevel=2,
16471660
)
16481661
with create_trace_span(name="Storage.Blob.downloadAsString"):
16491662
return self.download_as_bytes(
@@ -1999,12 +2012,18 @@ def _do_multipart_upload(
19992012
transport = self._get_transport(client)
20002013
if "metadata" in self._properties and "metadata" not in self._changes:
20012014
self._changes.add("metadata")
2015+
20022016
info = self._get_upload_arguments(client, content_type, command=command)
20032017
headers, object_metadata, content_type = info
20042018

2019+
if "crc32c" in self._properties:
2020+
object_metadata["crc32c"] = self._properties["crc32c"]
2021+
20052022
hostname = _get_host_name(client._connection)
20062023
base_url = _MULTIPART_URL_TEMPLATE.format(
2007-
hostname=hostname, bucket_path=self.bucket.path, api_version=_API_VERSION
2024+
hostname=hostname,
2025+
bucket_path=self.bucket.path,
2026+
api_version=_API_VERSION,
20082027
)
20092028
name_value_pairs = []
20102029

@@ -2195,9 +2214,14 @@ def _initiate_resumable_upload(
21952214
if extra_headers is not None:
21962215
headers.update(extra_headers)
21972216

2217+
if "crc32c" in self._properties:
2218+
object_metadata["crc32c"] = self._properties["crc32c"]
2219+
21982220
hostname = _get_host_name(client._connection)
21992221
base_url = _RESUMABLE_URL_TEMPLATE.format(
2200-
hostname=hostname, bucket_path=self.bucket.path, api_version=_API_VERSION
2222+
hostname=hostname,
2223+
bucket_path=self.bucket.path,
2224+
api_version=_API_VERSION,
22012225
)
22022226
name_value_pairs = []
22032227

@@ -2234,7 +2258,11 @@ def _initiate_resumable_upload(
22342258

22352259
upload_url = _add_query_parameters(base_url, name_value_pairs)
22362260
upload = ResumableUpload(
2237-
upload_url, chunk_size, headers=headers, checksum=checksum, retry=retry
2261+
upload_url,
2262+
chunk_size,
2263+
headers=headers,
2264+
checksum=checksum,
2265+
retry=retry,
22382266
)
22392267

22402268
upload.initiate(
@@ -3426,7 +3454,11 @@ def set_iam_policy(
34263454
return Policy.from_api_repr(info)
34273455

34283456
def test_iam_permissions(
3429-
self, permissions, client=None, timeout=_DEFAULT_TIMEOUT, retry=DEFAULT_RETRY
3457+
self,
3458+
permissions,
3459+
client=None,
3460+
timeout=_DEFAULT_TIMEOUT,
3461+
retry=DEFAULT_RETRY,
34303462
):
34313463
"""API call: test permissions
34323464
@@ -3693,7 +3725,10 @@ def compose(
36933725

36943726
source_objects = []
36953727
for source, source_generation in zip(sources, if_source_generation_match):
3696-
source_object = {"name": source.name, "generation": source.generation}
3728+
source_object = {
3729+
"name": source.name,
3730+
"generation": source.generation,
3731+
}
36973732

36983733
preconditions = {}
36993734
if source_generation is not None:
@@ -4154,7 +4189,10 @@ def open(
41544189
"encoding, errors and newline arguments are for text mode only"
41554190
)
41564191
return BlobWriter(
4157-
self, chunk_size=chunk_size, ignore_flush=ignore_flush, **kwargs
4192+
self,
4193+
chunk_size=chunk_size,
4194+
ignore_flush=ignore_flush,
4195+
**kwargs,
41584196
)
41594197
elif mode in ("r", "rt"):
41604198
if ignore_flush:

google/cloud/storage/bucket.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,10 @@ class LifecycleRuleSetStorageClass(dict):
386386
def __init__(self, storage_class, **kw):
387387
conditions = LifecycleRuleConditions(**kw)
388388
rule = {
389-
"action": {"type": "SetStorageClass", "storageClass": storage_class},
389+
"action": {
390+
"type": "SetStorageClass",
391+
"storageClass": storage_class,
392+
},
390393
"condition": dict(conditions),
391394
}
392395
super().__init__(rule)
@@ -846,6 +849,7 @@ def blob(
846849
encryption_key=None,
847850
kms_key_name=None,
848851
generation=None,
852+
crc32c_checksum=None,
849853
):
850854
"""Factory constructor for blob object.
851855
@@ -873,6 +877,14 @@ def blob(
873877
:param generation: (Optional) If present, selects a specific revision of
874878
this object.
875879
880+
:type crc32c_checksum: str
881+
:param crc32c_checksum:
882+
(Optional) If set, the CRC32C checksum of the blob's content.
883+
CRC32c checksum, as described in RFC 4960, Appendix B; encoded using
884+
base64 in big-endian byte order. See
885+
Apenndix B: https://datatracker.ietf.org/doc/html/rfc4960#appendix-B
886+
base64: https://datatracker.ietf.org/doc/html/rfc4648#section-4
887+
876888
:rtype: :class:`google.cloud.storage.blob.Blob`
877889
:returns: The blob object created.
878890
"""
@@ -883,6 +895,7 @@ def blob(
883895
encryption_key=encryption_key,
884896
kms_key_name=kms_key_name,
885897
generation=generation,
898+
crc32c_checksum=crc32c_checksum,
886899
)
887900

888901
def notification(
@@ -3253,7 +3266,10 @@ def configure_website(self, main_page_suffix=None, not_found_page=None):
32533266
:type not_found_page: str
32543267
:param not_found_page: The file to use when a page isn't found.
32553268
"""
3256-
data = {"mainPageSuffix": main_page_suffix, "notFoundPage": not_found_page}
3269+
data = {
3270+
"mainPageSuffix": main_page_suffix,
3271+
"notFoundPage": not_found_page,
3272+
}
32573273
self._patch_property("website", data)
32583274

32593275
def disable_website(self):
@@ -3385,7 +3401,11 @@ def set_iam_policy(
33853401
return Policy.from_api_repr(info)
33863402

33873403
def test_iam_permissions(
3388-
self, permissions, client=None, timeout=_DEFAULT_TIMEOUT, retry=DEFAULT_RETRY
3404+
self,
3405+
permissions,
3406+
client=None,
3407+
timeout=_DEFAULT_TIMEOUT,
3408+
retry=DEFAULT_RETRY,
33893409
):
33903410
"""API call: test permissions
33913411

tests/data/random_9_MiB_file

9 MB
Binary file not shown.

tests/system/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
("logo", "CloudPlatform_128px_Retina.png"),
3131
("big", "five-point-one-mb-file.zip"),
3232
("simple", "simple.txt"),
33+
("big_9MiB", "random_9_MiB_file"),
3334
]
3435
_file_data = {
3536
key: {"path": os.path.join(data_dirname, file_name)}

tests/system/test_blob.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,40 @@ def _check_blob_hash(blob, info):
3939
assert md5_hash == info["hash"]
4040

4141

42+
def test_large_file_write_from_stream_w_user_provided_checksum(
43+
shared_bucket,
44+
blobs_to_delete,
45+
file_data,
46+
service_account,
47+
):
48+
blob = shared_bucket.blob(
49+
f"LargeFile{uuid.uuid4().hex}", crc32c_checksum="20tD7w=="
50+
)
51+
52+
info = file_data["big_9MiB"]
53+
with open(info["path"], "rb") as file_obj:
54+
blob.upload_from_file(file_obj)
55+
blobs_to_delete.append(blob)
56+
57+
58+
def test_large_file_write_from_stream_w_user_provided_wrong_checksum(
59+
shared_bucket,
60+
blobs_to_delete,
61+
file_data,
62+
service_account,
63+
):
64+
blob = shared_bucket.blob(
65+
f"LargeFile{uuid.uuid4().hex}", crc32c_checksum="A0tD7w=="
66+
)
67+
68+
info = file_data["big_9MiB"]
69+
with pytest.raises(exceptions.BadRequest) as excep_info:
70+
with open(info["path"], "rb") as file_obj:
71+
blob.upload_from_file(file_obj)
72+
blobs_to_delete.append(blob)
73+
assert excep_info.value.code == 400
74+
75+
4276
def test_large_file_write_from_stream(
4377
shared_bucket,
4478
blobs_to_delete,

tests/unit/test_blob.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2762,12 +2762,21 @@ def _initiate_resumable_helper(
27622762
metadata=None,
27632763
mtls=False,
27642764
retry=None,
2765+
crc32c_checksum=None,
27652766
):
27662767
from google.cloud.storage._media.requests import ResumableUpload
27672768
from google.cloud.storage.blob import _DEFAULT_CHUNKSIZE
27682769

27692770
bucket = _Bucket(name="whammy", user_project=user_project)
2770-
blob = self._make_one("blob-name", bucket=bucket, kms_key_name=kms_key_name)
2771+
if crc32c_checksum is None:
2772+
blob = self._make_one("blob-name", bucket=bucket, kms_key_name=kms_key_name)
2773+
else:
2774+
blob = self._make_one(
2775+
"blob-name",
2776+
bucket=bucket,
2777+
kms_key_name=kms_key_name,
2778+
crc32c_checksum=crc32c_checksum,
2779+
)
27712780
if metadata:
27722781
self.assertIsNone(blob.metadata)
27732782
blob._properties["metadata"] = metadata
@@ -2919,6 +2928,10 @@ def _initiate_resumable_helper(
29192928
else:
29202929
# Check the mocks.
29212930
blob._get_writable_metadata.assert_called_once_with()
2931+
2932+
if "crc32c" in blob._properties:
2933+
object_metadata["crc32c"] = blob._properties["crc32c"]
2934+
29222935
payload = json.dumps(object_metadata).encode("utf-8")
29232936

29242937
with patch.object(
@@ -2945,6 +2958,17 @@ def _initiate_resumable_helper(
29452958
def test__initiate_resumable_upload_with_metadata(self):
29462959
self._initiate_resumable_helper(metadata={"test": "test"})
29472960

2961+
def test__initiate_resumable_upload_with_user_provided_checksum(self):
2962+
self._initiate_resumable_helper(
2963+
crc32c_checksum="this-is-a-fake-checksum-for-unit-tests",
2964+
)
2965+
2966+
def test__initiate_resumable_upload_w_metadata_and_user_provided_checksum(self):
2967+
self._initiate_resumable_helper(
2968+
crc32c_checksum="test-checksum",
2969+
metadata={"my-fav-key": "my-fav-value"},
2970+
)
2971+
29482972
def test__initiate_resumable_upload_with_custom_timeout(self):
29492973
self._initiate_resumable_helper(timeout=9.58)
29502974

0 commit comments

Comments
 (0)