add only_if_modified param to GSUtils.upload_file()
Needed so that the file history within gs://chromium-skia-skp-summaries will indicate when results changed.
BUG=skia:1942
R=borenet@google.com
Review URL: https://codereview.chromium.org/411723002
diff --git a/py/utils/gs_utils.py b/py/utils/gs_utils.py
index d49b3fe..1d39ef7 100644
--- a/py/utils/gs_utils.py
+++ b/py/utils/gs_utils.py
@@ -18,6 +18,7 @@
# System-level imports
import errno
+import hashlib
import os
import posixpath
import re
@@ -139,27 +140,51 @@
path: full path (Posix-style) of the file within the bucket to delete
"""
b = self._connect_to_bucket(bucket_name=bucket)
- item = Key(b)
- item.key = path
+ key = Key(b)
+ key.name = path
try:
- item.delete()
+ key.delete()
except BotoServerError, e:
e.body = (repr(e.body) +
' while deleting bucket=%s, path=%s' % (bucket, path))
raise
- def upload_file(self, source_path, dest_bucket, dest_path,
- predefined_acl=None, fine_grained_acl_list=None):
- """Upload contents of a local file to Google Storage.
+ def get_last_modified_time(self, bucket, path):
+ """Gets the timestamp of when this file was last modified.
- TODO(epoger): Add the only_if_modified param provided by upload_file() in
- https://github.com/google/skia-buildbot/blob/master/slave/skia_slave_scripts/utils/old_gs_utils.py ,
- so we can replace that function with this one.
+ Params:
+ bucket: GS bucket in which to look for the file
+ path: full path (Posix-style) of the file within the bucket to check
+
+ Returns the last modified time, as a freeform string. If the file was not
+ found, returns None.
+ """
+ b = self._connect_to_bucket(bucket_name=bucket)
+ try:
+ key = b.get_key(key_name=path)
+ if not key:
+ return None
+ return key.last_modified
+ except BotoServerError, e:
+ e.body = (repr(e.body) +
+ ' while getting attributes of bucket=%s, path=%s' % (
+ bucket, path))
+ raise
+
+ def upload_file(self, source_path, dest_bucket, dest_path,
+ only_if_modified=False, predefined_acl=None,
+ fine_grained_acl_list=None):
+ """Upload contents of a local file to Google Storage.
params:
source_path: full path (local-OS-style) on local disk to read from
dest_bucket: GCS bucket to copy the file to
dest_path: full path (Posix-style) within that bucket
+ only_if_modified: if True, only upload the file if it would actually
+ change the content on Google Storage (uploads the file if dest_path
+ does not exist, or if it exists but has different contents than
+ source_path). Note that this may take longer than just uploading the
+ file without checking first, due to extra round-trips!
predefined_acl: which predefined ACL to apply to the file on Google
Storage; must be one of the PredefinedACL values defined above.
If None, inherits dest_bucket's default object ACL.
@@ -170,22 +195,32 @@
or None if predefined_acl is sufficient
"""
b = self._connect_to_bucket(bucket_name=dest_bucket)
- item = Key(b)
- item.key = dest_path
+
+ if only_if_modified:
+ old_key = b.get_key(key_name=dest_path)
+ if old_key:
+ local_md5 = '"%s"' % _get_local_md5(path=source_path)
+ if local_md5 == old_key.etag:
+ print 'Skipping upload of unmodified file %s : %s' % (
+ source_path, local_md5)
+ return
+
+ key = Key(b)
+ key.name = dest_path
try:
- item.set_contents_from_filename(filename=source_path,
- policy=predefined_acl)
+ key.set_contents_from_filename(filename=source_path,
+ policy=predefined_acl)
except BotoServerError, e:
e.body = (repr(e.body) +
' while uploading source_path=%s to bucket=%s, path=%s' % (
- source_path, dest_bucket, item.key))
+ source_path, dest_bucket, key.name))
raise
# TODO(epoger): This may be inefficient, because it calls
# _connect_to_bucket() again. Depending on how expensive that
# call is, we may want to optimize this.
for (id_type, id_value, permission) in fine_grained_acl_list or []:
self.set_acl(
- bucket=dest_bucket, path=item.key,
+ bucket=dest_bucket, path=key.name,
id_type=id_type, id_value=id_value, permission=permission)
def upload_dir_contents(self, source_dir, dest_bucket, dest_dir,
@@ -237,10 +272,10 @@
predefined_acl=predefined_acl,
fine_grained_acl_list=fine_grained_acl_list)
else:
- item = Key(b)
- item.key = remote_path
+ key = Key(b)
+ key.name = remote_path
try:
- item.set_contents_from_filename(
+ key.set_contents_from_filename(
filename=local_path, policy=predefined_acl)
except BotoServerError, e:
e.body = (repr(e.body) +
@@ -267,13 +302,13 @@
needed to create dest_path
"""
b = self._connect_to_bucket(bucket_name=source_bucket)
- item = Key(b)
- item.key = source_path
+ key = Key(b)
+ key.name = source_path
if create_subdirs_if_needed:
_makedirs_if_needed(os.path.dirname(dest_path))
with open(dest_path, 'w') as f:
try:
- item.get_contents_to_file(fp=f)
+ key.get_contents_to_file(fp=f)
except BotoServerError, e:
e.body = (repr(e.body) +
' while downloading bucket=%s, path=%s to local_path=%s' % (
@@ -302,16 +337,16 @@
bucket=source_bucket, subdir=source_dir)
for filename in files:
- item = Key(b)
- item.key = posixpath.join(source_dir, filename)
+ key = Key(b)
+ key.name = posixpath.join(source_dir, filename)
dest_path = os.path.join(dest_dir, filename)
with open(dest_path, 'w') as f:
try:
- item.get_contents_to_file(fp=f)
+ key.get_contents_to_file(fp=f)
except BotoServerError, e:
e.body = (repr(e.body) +
' while downloading bucket=%s, path=%s to local_path=%s' % (
- source_bucket, item.key, dest_path))
+ source_bucket, key.name, dest_path))
raise
for dirname in dirs:
@@ -431,13 +466,13 @@
prefix_length = len(prefix) if prefix else 0
b = self._connect_to_bucket(bucket_name=bucket)
- lister = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/')
+ items = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/')
dirs = []
files = []
- for item in lister:
+ for item in items:
t = type(item)
if t is Key:
- files.append(item.key[prefix_length:])
+ files.append(item.name[prefix_length:])
elif t is Prefix:
dirs.append(item.name[prefix_length:-1])
return (dirs, files)
@@ -500,3 +535,14 @@
except OSError as e:
if e.errno != errno.EEXIST:
raise
+
+
+def _get_local_md5(path):
+ """Returns the MD5 hash of a file on local disk."""
+ hasher = hashlib.md5()
+ with open(path, 'rb') as f:
+ while True:
+ data = f.read(64*1024)
+ if not data:
+ return hasher.hexdigest()
+ hasher.update(data)
diff --git a/py/utils/gs_utils_manualtest.py b/py/utils/gs_utils_manualtest.py
index db70267..a5258d0 100755
--- a/py/utils/gs_utils_manualtest.py
+++ b/py/utils/gs_utils_manualtest.py
@@ -12,6 +12,7 @@
import shutil
import sys
import tempfile
+import time
# Local imports.
import gs_utils
@@ -44,6 +45,63 @@
gs.list_bucket_contents(bucket=TEST_BUCKET, subdir=None)
+def _test_only_if_modified():
+ """Test only_if_modified param within upload_file()."""
+ gs = _get_authenticated_gs_handle()
+ filename = 'filename'
+ remote_dir = _get_unique_posix_dir()
+ dest_path = posixpath.join(remote_dir, filename)
+ local_dir = tempfile.mkdtemp()
+ try:
+ # Create a file on local disk, and upload it for the first time.
+ local_path = os.path.join(local_dir, filename)
+ with open(local_path, 'w') as f:
+ f.write('original contents')
+ gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
+ dest_path=dest_path, only_if_modified=True)
+ try:
+      # Re-upload the same file two seconds later, with only_if_modified=False;
+      # the timestamp should change.
+ old_timestamp = gs.get_last_modified_time(
+ bucket=TEST_BUCKET, path=dest_path)
+ time.sleep(2)
+ gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
+ dest_path=dest_path, only_if_modified=False)
+ new_timestamp = gs.get_last_modified_time(
+ bucket=TEST_BUCKET, path=dest_path)
+ assert old_timestamp != new_timestamp, '%s != %s' % (
+ old_timestamp, new_timestamp)
+
+      # Re-upload the same file two seconds later, with only_if_modified=True;
+      # the timestamp should NOT change.
+ old_timestamp = new_timestamp
+ time.sleep(2)
+ gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
+ dest_path=dest_path, only_if_modified=True)
+ new_timestamp = gs.get_last_modified_time(
+ bucket=TEST_BUCKET, path=dest_path)
+ assert old_timestamp == new_timestamp, '%s == %s' % (
+ old_timestamp, new_timestamp)
+
+      # MODIFY and re-upload the file two seconds later, with
+      # only_if_modified=True; the timestamp SHOULD change.
+ old_timestamp = new_timestamp
+ with open(local_path, 'w') as f:
+ f.write('modified contents')
+ time.sleep(2)
+ gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
+ dest_path=dest_path, only_if_modified=True)
+ new_timestamp = gs.get_last_modified_time(
+ bucket=TEST_BUCKET, path=dest_path)
+ assert old_timestamp != new_timestamp, '%s != %s' % (
+ old_timestamp, new_timestamp)
+ finally:
+ # Clean up the remote_dir.
+ gs.delete_file(bucket=TEST_BUCKET, path=dest_path)
+ finally:
+ # Clean up the local dir.
+ shutil.rmtree(local_dir)
+
def _test_authenticated_round_trip():
gs = _get_authenticated_gs_handle()
remote_dir = _get_unique_posix_dir()
@@ -206,6 +264,7 @@
if __name__ == '__main__':
+ _test_only_if_modified()
_test_public_read()
_test_authenticated_round_trip()
_test_dir_upload_and_download()