gs_utils.upload_file(): upload with temp name, verify contents, then rename

I want to do this before we start uploading multiple files in parallel, so we will be protected against any failures.

BUG=skia:2780
R=borenet@google.com

Review URL: https://codereview.chromium.org/419773003
diff --git a/py/utils/gs_utils.py b/py/utils/gs_utils.py
index 7f3b59c..f53b4d1 100644
--- a/py/utils/gs_utils.py
+++ b/py/utils/gs_utils.py
@@ -160,7 +160,7 @@
       key.delete()
     except BotoServerError, e:
       e.body = (repr(e.body) +
-                ' while deleting bucket=%s, path=%s' % (b.name, path))
+                ' while deleting gs://%s/%s' % (b.name, path))
       raise
 
   def get_last_modified_time(self, bucket, path):
@@ -181,8 +181,7 @@
       return key.last_modified
     except BotoServerError, e:
       e.body = (repr(e.body) +
-                ' while getting attributes of bucket=%s, path=%s' % (
-                    b.name, path))
+                ' while getting attributes of gs://%s/%s' % (b.name, path))
       raise
 
   def upload_file(self, source_path, dest_bucket, dest_path,
@@ -211,6 +210,7 @@
         WorkingWithObjectMetadata#content-encoding
     """
     b = self._connect_to_bucket(bucket=dest_bucket)
+    local_md5 = None  # filled in lazily
 
     if upload_if == self.UploadIf.IF_NEW:
       old_key = b.get_key(key_name=dest_path)
@@ -221,27 +221,62 @@
     elif upload_if == self.UploadIf.IF_MODIFIED:
       old_key = b.get_key(key_name=dest_path)
       if old_key:
-        local_md5 = '"%s"' % _get_local_md5(path=source_path)
-        if local_md5 == old_key.etag:
+        if not local_md5:
+          local_md5 = _get_local_md5(path=source_path)
+        if ('"%s"' % local_md5) == old_key.etag:
           print 'Skipping upload of unmodified file gs://%s/%s : %s' % (
               b.name, dest_path, local_md5)
           return
     elif upload_if != self.UploadIf.ALWAYS:
       raise Exception('unknown value of upload_if: %s' % upload_if)
 
-    key = Key(b)
-    key.name = dest_path
+    # Upload the file using a temporary name at first, in case the transfer
+    # is interrupted partway through.
+    if not local_md5:
+      local_md5 = _get_local_md5(path=source_path)
+    initial_key = Key(b)
+    initial_key.name = dest_path + '-uploading-' + local_md5
     try:
-      key.set_contents_from_filename(filename=source_path,
-                                     policy=predefined_acl)
+      initial_key.set_contents_from_filename(filename=source_path,
+                                             policy=predefined_acl)
     except BotoServerError, e:
       e.body = (repr(e.body) +
-                ' while uploading source_path=%s to bucket=%s, path=%s' % (
-                    source_path, b.name, key.name))
+                ' while uploading source_path=%s to gs://%s/%s' % (
+                    source_path, b.name, initial_key.name))
       raise
+
+    # Verify that the file contents were uploaded successfully.
+    #
+    # TODO(epoger): Check whether the boto library or XML API already do this...
+    # if so, we may be duplicating effort here, and maybe we don't need to do
+    # the whole "upload using temporary filename, then rename" thing.
+    #
+    # TODO(epoger): Confirm that the etag is set on the server side...
+    # otherwise, we may just be validating another MD5 hash that was generated
+    # on the client side before the file was uploaded!
+    validate_key = b.get_key(key_name=initial_key.name)
+    if validate_key.etag != ('"%s"' % local_md5):
+      raise Exception('found wrong MD5 after uploading gs://%s/%s' % (
+          b.name, validate_key.name))
+
+    # Rename the file to its real name.
+    #
+    # TODO(epoger): I don't know how long this takes.  I wish we could rename
+    # the key instead, but AFAICT you can't do that.
+    # Perhaps we could use Key.compose() to create a composite object pointing
+    # at the original key?
+    # See https://developers.google.com/storage/docs/composite-objects
+    final_key = b.copy_key(
+        new_key_name=dest_path, src_key_name=initial_key.name,
+        src_bucket_name=b.name, preserve_acl=False)
+    initial_key.delete()
+
+    # Set ACLs on the file.
+    # We do this *after* copy_key(), because copy_key's preserve_acl
+    # functionality would incur a performance hit.
     for (id_type, id_value, permission) in fine_grained_acl_list or []:
       self.set_acl(
-          bucket=b, path=key.name,
+          bucket=b, path=final_key.name,
           id_type=id_type, id_value=id_value, permission=permission)
 
   def upload_dir_contents(self, source_dir, dest_bucket, dest_dir,
@@ -337,7 +372,7 @@
         key.get_contents_to_file(fp=f)
       except BotoServerError, e:
         e.body = (repr(e.body) +
-                  ' while downloading bucket=%s, path=%s to local_path=%s' % (
+                  ' while downloading gs://%s/%s to local_path=%s' % (
                       b.name, source_path, dest_path))
         raise
 
@@ -371,7 +406,7 @@
           key.get_contents_to_file(fp=f)
         except BotoServerError, e:
           e.body = (repr(e.body) +
-                    ' while downloading bucket=%s, path=%s to local_path=%s' % (
+                    ' while downloading gs://%s/%s to local_path=%s' % (
                         b.name, key.name, dest_path))
           raise