add only_if_modified param to GSUtils.upload_file() Needed so that the file history within gs://chromium-skia-skp-summaries will indicate when results changed. BUG=skia:1942 R=borenet@google.com Review URL: https://codereview.chromium.org/411723002
diff --git a/py/utils/gs_utils.py b/py/utils/gs_utils.py index d49b3fe..1d39ef7 100644 --- a/py/utils/gs_utils.py +++ b/py/utils/gs_utils.py
@@ -18,6 +18,7 @@ # System-level imports import errno +import hashlib import os import posixpath import re @@ -139,27 +140,51 @@ path: full path (Posix-style) of the file within the bucket to delete """ b = self._connect_to_bucket(bucket_name=bucket) - item = Key(b) - item.key = path + key = Key(b) + key.name = path try: - item.delete() + key.delete() except BotoServerError, e: e.body = (repr(e.body) + ' while deleting bucket=%s, path=%s' % (bucket, path)) raise - def upload_file(self, source_path, dest_bucket, dest_path, - predefined_acl=None, fine_grained_acl_list=None): - """Upload contents of a local file to Google Storage. + def get_last_modified_time(self, bucket, path): + """Gets the timestamp of when this file was last modified. - TODO(epoger): Add the only_if_modified param provided by upload_file() in - https://github.com/google/skia-buildbot/blob/master/slave/skia_slave_scripts/utils/old_gs_utils.py , - so we can replace that function with this one. + Params: + bucket: GS bucket in which to look for the file + path: full path (Posix-style) of the file within the bucket to check + + Returns the last modified time, as a freeform string. If the file was not + found, returns None. + """ + b = self._connect_to_bucket(bucket_name=bucket) + try: + key = b.get_key(key_name=path) + if not key: + return None + return key.last_modified + except BotoServerError, e: + e.body = (repr(e.body) + + ' while getting attributes of bucket=%s, path=%s' % ( + bucket, path)) + raise + + def upload_file(self, source_path, dest_bucket, dest_path, + only_if_modified=False, predefined_acl=None, + fine_grained_acl_list=None): + """Upload contents of a local file to Google Storage. 
params: source_path: full path (local-OS-style) on local disk to read from dest_bucket: GCS bucket to copy the file to dest_path: full path (Posix-style) within that bucket + only_if_modified: if True, only upload the file if it would actually + change the content on Google Storage (uploads the file if dest_path + does not exist, or if it exists but has different contents than + source_path). Note that this may take longer than just uploading the + file without checking first, due to extra round-trips! predefined_acl: which predefined ACL to apply to the file on Google Storage; must be one of the PredefinedACL values defined above. If None, inherits dest_bucket's default object ACL. @@ -170,22 +195,32 @@ or None if predefined_acl is sufficient """ b = self._connect_to_bucket(bucket_name=dest_bucket) - item = Key(b) - item.key = dest_path + + if only_if_modified: + old_key = b.get_key(key_name=dest_path) + if old_key: + local_md5 = '"%s"' % _get_local_md5(path=source_path) + if local_md5 == old_key.etag: + print 'Skipping upload of unmodified file %s : %s' % ( + source_path, local_md5) + return + + key = Key(b) + key.name = dest_path try: - item.set_contents_from_filename(filename=source_path, - policy=predefined_acl) + key.set_contents_from_filename(filename=source_path, + policy=predefined_acl) except BotoServerError, e: e.body = (repr(e.body) + ' while uploading source_path=%s to bucket=%s, path=%s' % ( - source_path, dest_bucket, item.key)) + source_path, dest_bucket, key.name)) raise # TODO(epoger): This may be inefficient, because it calls # _connect_to_bucket() again. Depending on how expensive that # call is, we may want to optimize this. 
for (id_type, id_value, permission) in fine_grained_acl_list or []: self.set_acl( - bucket=dest_bucket, path=item.key, + bucket=dest_bucket, path=key.name, id_type=id_type, id_value=id_value, permission=permission) def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, @@ -237,10 +272,10 @@ predefined_acl=predefined_acl, fine_grained_acl_list=fine_grained_acl_list) else: - item = Key(b) - item.key = remote_path + key = Key(b) + key.name = remote_path try: - item.set_contents_from_filename( + key.set_contents_from_filename( filename=local_path, policy=predefined_acl) except BotoServerError, e: e.body = (repr(e.body) + @@ -267,13 +302,13 @@ needed to create dest_path """ b = self._connect_to_bucket(bucket_name=source_bucket) - item = Key(b) - item.key = source_path + key = Key(b) + key.name = source_path if create_subdirs_if_needed: _makedirs_if_needed(os.path.dirname(dest_path)) with open(dest_path, 'w') as f: try: - item.get_contents_to_file(fp=f) + key.get_contents_to_file(fp=f) except BotoServerError, e: e.body = (repr(e.body) + ' while downloading bucket=%s, path=%s to local_path=%s' % ( @@ -302,16 +337,16 @@ bucket=source_bucket, subdir=source_dir) for filename in files: - item = Key(b) - item.key = posixpath.join(source_dir, filename) + key = Key(b) + key.name = posixpath.join(source_dir, filename) dest_path = os.path.join(dest_dir, filename) with open(dest_path, 'w') as f: try: - item.get_contents_to_file(fp=f) + key.get_contents_to_file(fp=f) except BotoServerError, e: e.body = (repr(e.body) + ' while downloading bucket=%s, path=%s to local_path=%s' % ( - source_bucket, item.key, dest_path)) + source_bucket, key.name, dest_path)) raise for dirname in dirs: @@ -431,13 +466,13 @@ prefix_length = len(prefix) if prefix else 0 b = self._connect_to_bucket(bucket_name=bucket) - lister = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/') + items = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/') dirs = [] files = [] - for item in 
lister: + for item in items: t = type(item) if t is Key: - files.append(item.key[prefix_length:]) + files.append(item.name[prefix_length:]) elif t is Prefix: dirs.append(item.name[prefix_length:-1]) return (dirs, files) @@ -500,3 +535,14 @@ except OSError as e: if e.errno != errno.EEXIST: raise + + +def _get_local_md5(path): + """Returns the MD5 hash of a file on local disk.""" + hasher = hashlib.md5() + with open(path, 'rb') as f: + while True: + data = f.read(64*1024) + if not data: + return hasher.hexdigest() + hasher.update(data)
diff --git a/py/utils/gs_utils_manualtest.py b/py/utils/gs_utils_manualtest.py index db70267..a5258d0 100755 --- a/py/utils/gs_utils_manualtest.py +++ b/py/utils/gs_utils_manualtest.py
def _test_only_if_modified():
  """Test only_if_modified param within upload_file().

  Uploads a file repeatedly (modified and unmodified, with only_if_modified
  True and False) and checks the remote last-modified timestamp to confirm
  whether each upload actually rewrote the object.
  """
  gs = _get_authenticated_gs_handle()
  filename = 'filename'
  remote_dir = _get_unique_posix_dir()
  dest_path = posixpath.join(remote_dir, filename)
  local_dir = tempfile.mkdtemp()
  try:
    # Create a file on local disk, and upload it for the first time.
    local_path = os.path.join(local_dir, filename)
    with open(local_path, 'w') as f:
      f.write('original contents')
    gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
                   dest_path=dest_path, only_if_modified=True)
    try:
      # Re-upload the same file two seconds later, with
      # only_if_modified=False; the timestamp should change.
      old_timestamp = gs.get_last_modified_time(
          bucket=TEST_BUCKET, path=dest_path)
      time.sleep(2)
      gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
                     dest_path=dest_path, only_if_modified=False)
      new_timestamp = gs.get_last_modified_time(
          bucket=TEST_BUCKET, path=dest_path)
      # BUGFIX: the failure message must describe the observed (bad) state,
      # i.e. the timestamps being EQUAL when we expected them to differ.
      assert old_timestamp != new_timestamp, '%s == %s' % (
          old_timestamp, new_timestamp)

      # Re-upload the same file two seconds later, with
      # only_if_modified=True; the timestamp should NOT change.
      old_timestamp = new_timestamp
      time.sleep(2)
      gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
                     dest_path=dest_path, only_if_modified=True)
      new_timestamp = gs.get_last_modified_time(
          bucket=TEST_BUCKET, path=dest_path)
      # BUGFIX: on failure the timestamps DIFFERED, so report '!='.
      assert old_timestamp == new_timestamp, '%s != %s' % (
          old_timestamp, new_timestamp)

      # MODIFY and re-upload the file two seconds later, with
      # only_if_modified=True; the timestamp SHOULD change.
      old_timestamp = new_timestamp
      with open(local_path, 'w') as f:
        f.write('modified contents')
      time.sleep(2)
      gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
                     dest_path=dest_path, only_if_modified=True)
      new_timestamp = gs.get_last_modified_time(
          bucket=TEST_BUCKET, path=dest_path)
      # BUGFIX: on failure the timestamps were EQUAL, so report '=='.
      assert old_timestamp != new_timestamp, '%s == %s' % (
          old_timestamp, new_timestamp)
    finally:
      # Clean up the remote_dir.
      gs.delete_file(bucket=TEST_BUCKET, path=dest_path)
  finally:
    # Clean up the local dir.
    shutil.rmtree(local_dir)