add only_if_modified param to GSUtils.upload_file()

Needed so that the file history within gs://chromium-skia-skp-summaries will indicate when results changed.

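Example (a minimal sketch: the local path and dest_path below are
placeholders, and the boto_file_path constructor arg is an assumption;
any authenticated GSUtils handle will do):

  import gs_utils
  # NOTE: boto_file_path is assumed here; construct the handle however
  # your script normally authenticates.
  gs = gs_utils.GSUtils(boto_file_path='/path/to/.boto')
  # With only_if_modified=True, this uploads only if the remote copy is
  # missing or has different contents than the local file.
  gs.upload_file(source_path='/tmp/summary.json',
                 dest_bucket='chromium-skia-skp-summaries',
                 dest_path='summaries/summary.json',
                 only_if_modified=True)
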
BUG=skia:1942
R=borenet@google.com

Review URL: https://codereview.chromium.org/411723002
diff --git a/py/utils/gs_utils.py b/py/utils/gs_utils.py
index d49b3fe..1d39ef7 100644
--- a/py/utils/gs_utils.py
+++ b/py/utils/gs_utils.py
@@ -18,6 +18,7 @@
 
 # System-level imports
 import errno
+import hashlib
 import os
 import posixpath
 import re
@@ -139,27 +140,51 @@
       path: full path (Posix-style) of the file within the bucket to delete
     """
     b = self._connect_to_bucket(bucket_name=bucket)
-    item = Key(b)
-    item.key = path
+    key = Key(b)
+    key.name = path
     try:
-      item.delete()
+      key.delete()
     except BotoServerError, e:
       e.body = (repr(e.body) +
                 ' while deleting bucket=%s, path=%s' % (bucket, path))
       raise
 
-  def upload_file(self, source_path, dest_bucket, dest_path,
-                  predefined_acl=None, fine_grained_acl_list=None):
-    """Upload contents of a local file to Google Storage.
+  def get_last_modified_time(self, bucket, path):
+    """Gets the timestamp of when this file was last modified.
 
-    TODO(epoger): Add the only_if_modified param provided by upload_file() in
-    https://github.com/google/skia-buildbot/blob/master/slave/skia_slave_scripts/utils/old_gs_utils.py ,
-    so we can replace that function with this one.
+    params:
+      bucket: GS bucket in which to look for the file
+      path: full path (Posix-style) of the file within the bucket to check
+
+    Returns the last modified time, as a freeform string.  If the file was not
+    found, returns None.
+    """
+    b = self._connect_to_bucket(bucket_name=bucket)
+    try:
+      key = b.get_key(key_name=path)
+      if not key:
+        return None
+      return key.last_modified
+    except BotoServerError, e:
+      e.body = (repr(e.body) +
+                ' while getting attributes of bucket=%s, path=%s' % (
+                    bucket, path))
+      raise
+
+  def upload_file(self, source_path, dest_bucket, dest_path,
+                  only_if_modified=False, predefined_acl=None,
+                  fine_grained_acl_list=None):
+    """Upload contents of a local file to Google Storage.
 
     params:
       source_path: full path (local-OS-style) on local disk to read from
       dest_bucket: GCS bucket to copy the file to
       dest_path: full path (Posix-style) within that bucket
+      only_if_modified: if True, only upload the file if it would actually
+          change the content on Google Storage (uploads the file if dest_path
+          does not exist, or if it exists but has different contents than
+          source_path).  Note that this may take longer than just uploading
+          the file without checking first, due to the extra round-trip needed
+          to perform the check!
       predefined_acl: which predefined ACL to apply to the file on Google
           Storage; must be one of the PredefinedACL values defined above.
           If None, inherits dest_bucket's default object ACL.
@@ -170,22 +195,32 @@
           or None if predefined_acl is sufficient
     """
     b = self._connect_to_bucket(bucket_name=dest_bucket)
-    item = Key(b)
-    item.key = dest_path
+
+    if only_if_modified:
+      old_key = b.get_key(key_name=dest_path)
+      if old_key:
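+        # boto exposes the remote object's ETag with surrounding double
+        # quotes, and for objects uploaded in a single operation the ETag
+        # is the MD5 digest of the contents; hence the quoting below.
+        # (If the ETag is not an MD5, the files just look different and
+        # we re-upload.)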
+        local_md5 = '"%s"' % _get_local_md5(path=source_path)
+        if local_md5 == old_key.etag:
+          print 'Skipping upload of unmodified file %s : %s' % (
+              source_path, local_md5)
+          return
+
+    key = Key(b)
+    key.name = dest_path
     try:
-      item.set_contents_from_filename(filename=source_path,
-                                      policy=predefined_acl)
+      key.set_contents_from_filename(filename=source_path,
+                                     policy=predefined_acl)
     except BotoServerError, e:
       e.body = (repr(e.body) +
                 ' while uploading source_path=%s to bucket=%s, path=%s' % (
-                    source_path, dest_bucket, item.key))
+                    source_path, dest_bucket, key.name))
       raise
     # TODO(epoger): This may be inefficient, because it calls
     # _connect_to_bucket() again.  Depending on how expensive that
     # call is, we may want to optimize this.
     for (id_type, id_value, permission) in fine_grained_acl_list or []:
       self.set_acl(
-          bucket=dest_bucket, path=item.key,
+          bucket=dest_bucket, path=key.name,
           id_type=id_type, id_value=id_value, permission=permission)
 
   def upload_dir_contents(self, source_dir, dest_bucket, dest_dir,
@@ -237,10 +272,10 @@
             predefined_acl=predefined_acl,
             fine_grained_acl_list=fine_grained_acl_list)
       else:
-        item = Key(b)
-        item.key = remote_path
+        key = Key(b)
+        key.name = remote_path
         try:
-          item.set_contents_from_filename(
+          key.set_contents_from_filename(
               filename=local_path, policy=predefined_acl)
         except BotoServerError, e:
           e.body = (repr(e.body) +
@@ -267,13 +302,13 @@
           needed to create dest_path
     """
     b = self._connect_to_bucket(bucket_name=source_bucket)
-    item = Key(b)
-    item.key = source_path
+    key = Key(b)
+    key.name = source_path
     if create_subdirs_if_needed:
       _makedirs_if_needed(os.path.dirname(dest_path))
     with open(dest_path, 'w') as f:
       try:
-        item.get_contents_to_file(fp=f)
+        key.get_contents_to_file(fp=f)
       except BotoServerError, e:
         e.body = (repr(e.body) +
                   ' while downloading bucket=%s, path=%s to local_path=%s' % (
@@ -302,16 +337,16 @@
         bucket=source_bucket, subdir=source_dir)
 
     for filename in files:
-      item = Key(b)
-      item.key = posixpath.join(source_dir, filename)
+      key = Key(b)
+      key.name = posixpath.join(source_dir, filename)
       dest_path = os.path.join(dest_dir, filename)
       with open(dest_path, 'w') as f:
         try:
-          item.get_contents_to_file(fp=f)
+          key.get_contents_to_file(fp=f)
         except BotoServerError, e:
           e.body = (repr(e.body) +
                     ' while downloading bucket=%s, path=%s to local_path=%s' % (
-                        source_bucket, item.key, dest_path))
+                        source_bucket, key.name, dest_path))
           raise
 
     for dirname in dirs:
@@ -431,13 +466,13 @@
     prefix_length = len(prefix) if prefix else 0
 
     b = self._connect_to_bucket(bucket_name=bucket)
-    lister = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/')
+    items = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/')
     dirs = []
     files = []
-    for item in lister:
+    for item in items:
       t = type(item)
       if t is Key:
-        files.append(item.key[prefix_length:])
+        files.append(item.name[prefix_length:])
       elif t is Prefix:
         dirs.append(item.name[prefix_length:-1])
     return (dirs, files)
@@ -500,3 +535,14 @@
   except OSError as e:
     if e.errno != errno.EEXIST:
       raise
+
+
+def _get_local_md5(path):
+  """Returns the MD5 hash of a file on local disk."""
+  hasher = hashlib.md5()
+  with open(path, 'rb') as f:
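+    # Hash in 64 KiB chunks so that arbitrarily large files can be
+    # processed without reading them entirely into memory.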
+    while True:
+      data = f.read(64*1024)
+      if not data:
+        return hasher.hexdigest()
+      hasher.update(data)
diff --git a/py/utils/gs_utils_manualtest.py b/py/utils/gs_utils_manualtest.py
index db70267..a5258d0 100755
--- a/py/utils/gs_utils_manualtest.py
+++ b/py/utils/gs_utils_manualtest.py
@@ -12,6 +12,7 @@
 import shutil
 import sys
 import tempfile
+import time
 
 # Local imports.
 import gs_utils
@@ -44,6 +45,63 @@
   gs.list_bucket_contents(bucket=TEST_BUCKET, subdir=None)
 
 
+def _test_only_if_modified():
+  """Test only_if_modified param within upload_file()."""
+  gs = _get_authenticated_gs_handle()
+  filename = 'filename'
+  remote_dir = _get_unique_posix_dir()
+  dest_path = posixpath.join(remote_dir, filename)
+  local_dir = tempfile.mkdtemp()
+  try:
+    # Create a file on local disk, and upload it for the first time.
+    local_path = os.path.join(local_dir, filename)
+    with open(local_path, 'w') as f:
+      f.write('original contents')
+    gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
+                   dest_path=dest_path, only_if_modified=True)
+    try:
+      # Re-upload the same file a couple of seconds later, with
+      # only_if_modified=False; the timestamp should change.  (The sleeps
+      # below get us past the one-second granularity of the timestamps.)
+      old_timestamp = gs.get_last_modified_time(
+          bucket=TEST_BUCKET, path=dest_path)
+      time.sleep(2)
+      gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
+                     dest_path=dest_path, only_if_modified=False)
+      new_timestamp = gs.get_last_modified_time(
+          bucket=TEST_BUCKET, path=dest_path)
+      assert old_timestamp != new_timestamp, '%s == %s' % (
+          old_timestamp, new_timestamp)
+
+      # Re-upload the same file a couple of seconds later, with
+      # only_if_modified=True; the timestamp should NOT change.
+      old_timestamp = new_timestamp
+      time.sleep(2)
+      gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
+                     dest_path=dest_path, only_if_modified=True)
+      new_timestamp = gs.get_last_modified_time(
+          bucket=TEST_BUCKET, path=dest_path)
+      assert old_timestamp == new_timestamp, '%s != %s' % (
+          old_timestamp, new_timestamp)
+
+      # MODIFY and re-upload the file a couple of seconds later, with
+      # only_if_modified=True; the timestamp SHOULD change.
+      old_timestamp = new_timestamp
+      with open(local_path, 'w') as f:
+        f.write('modified contents')
+      time.sleep(2)
+      gs.upload_file(source_path=local_path, dest_bucket=TEST_BUCKET,
+                     dest_path=dest_path, only_if_modified=True)
+      new_timestamp = gs.get_last_modified_time(
+          bucket=TEST_BUCKET, path=dest_path)
+      assert old_timestamp != new_timestamp, '%s == %s' % (
+          old_timestamp, new_timestamp)
+    finally:
+      # Clean up the remote_dir.
+      gs.delete_file(bucket=TEST_BUCKET, path=dest_path)
+  finally:
+    # Clean up the local dir.
+    shutil.rmtree(local_dir)
+
+
 def _test_authenticated_round_trip():
   gs = _get_authenticated_gs_handle()
   remote_dir = _get_unique_posix_dir()
@@ -206,6 +264,7 @@
 
 
 if __name__ == '__main__':
+  _test_only_if_modified()
   _test_public_read()
   _test_authenticated_round_trip()
   _test_dir_upload_and_download()