Add upload_dir_contents() and download_dir_contents() to gs_utils.py
(with support for setting fine-grained ACLs, as needed when uploading SKPs)
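
For reference, a minimal usage sketch of the new methods, in the style of the
self-tests at the bottom of the file (the source_dir/dest_dir paths below are
placeholders, and a ~/.boto file with suitable credentials is assumed):

  gs = GSUtils(boto_file_path=os.path.expanduser(os.path.join('~', '.boto')))
  # Upload a local directory tree, applying a predefined "base coat" ACL plus
  # a fine-grained read grant for everyone at chromium.org.
  gs.upload_dir_contents(
      source_dir='/tmp/skps', dest_bucket='chromium-skia-gm',
      dest_dir='playback/skps',
      predefined_acl=PREDEFINED_ACL_PRIVATE,
      fine_grained_acl_list=[
          (ID_TYPE_GROUP_BY_DOMAIN, 'chromium.org', PERMISSION_READ)])
  # Download it again; files are merged into dest_dir, overwriting any
  # same-named files already present there.
  gs.download_dir_contents(
      source_bucket='chromium-skia-gm', source_dir='playback/skps',
      dest_dir='/tmp/skps-copy')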

BUG=skia:2618,skia:1942
R=rmistry@google.com

Review URL: https://codereview.chromium.org/407533002
diff --git a/py/utils/gs_utils.py b/py/utils/gs_utils.py
index 031a0c3..ecbe68c 100755
--- a/py/utils/gs_utils.py
+++ b/py/utils/gs_utils.py
@@ -36,6 +36,7 @@
     # We need to insert at the beginning of the path, to make sure that our
     # imported versions are favored over others that might be in the path.
     sys.path.insert(0, import_dirpath)
+from boto.exception import BotoServerError
 from boto.gs import acl
 from boto.gs.bucket import Bucket
 from boto.gs.connection import GSConnection
@@ -44,20 +45,33 @@
 from boto.s3.connection import SubdomainCallingFormat
 from boto.s3.prefix import Prefix
 
-# Permissions that may be set on each file in Google Storage.
-# See SupportedPermissions in
+# Predefined (aka "canned") ACLs that provide a "base coat" of permissions for
+# each file in Google Storage.  See CannedACLStrings in
 # https://github.com/boto/boto/blob/develop/boto/gs/acl.py
+# Also see https://developers.google.com/storage/docs/accesscontrol
+PREDEFINED_ACL_AUTHENTICATED_READ        = 'authenticated-read'
+PREDEFINED_ACL_BUCKET_OWNER_FULL_CONTROL = 'bucket-owner-full-control'
+PREDEFINED_ACL_BUCKET_OWNER_READ         = 'bucket-owner-read'
+PREDEFINED_ACL_PRIVATE                   = 'private'
+PREDEFINED_ACL_PROJECT_PRIVATE           = 'project-private'
+PREDEFINED_ACL_PUBLIC_READ               = 'public-read'
+PREDEFINED_ACL_PUBLIC_READ_WRITE         = 'public-read-write'
+
+# "Fine-grained" permissions that may be set per user/group on each file in
+# Google Storage.  See SupportedPermissions in
+# https://github.com/boto/boto/blob/develop/boto/gs/acl.py
+# Also see https://developers.google.com/storage/docs/accesscontrol
 PERMISSION_NONE  = None
 PERMISSION_OWNER = 'FULL_CONTROL'
 PERMISSION_READ  = 'READ'
 PERMISSION_WRITE = 'WRITE'
 
-# Types of identifiers we can use to set ACLs.
+# Types of identifiers we can use to set "fine-grained" ACLs.
 ID_TYPE_GROUP_BY_DOMAIN = acl.GROUP_BY_DOMAIN
-ID_TYPE_GROUP_BY_EMAIL = acl.GROUP_BY_EMAIL
-ID_TYPE_GROUP_BY_ID = acl.GROUP_BY_ID
-ID_TYPE_USER_BY_EMAIL = acl.USER_BY_EMAIL
-ID_TYPE_USER_BY_ID = acl.USER_BY_ID
+ID_TYPE_GROUP_BY_EMAIL  = acl.GROUP_BY_EMAIL
+ID_TYPE_GROUP_BY_ID     = acl.GROUP_BY_ID
+ID_TYPE_USER_BY_EMAIL   = acl.USER_BY_EMAIL
+ID_TYPE_USER_BY_ID      = acl.USER_BY_ID
 
 # Which field we get/set in ACL entries, depending on ID_TYPE.
 FIELD_BY_ID_TYPE = {
@@ -120,16 +134,21 @@
       bucket: GS bucket to delete a file from
       path: full path (Posix-style) of the file within the bucket to delete
     """
-    conn = self._create_connection()
-    b = conn.get_bucket(bucket_name=bucket)
+    b = self._connect_to_bucket(bucket_name=bucket)
     item = Key(b)
     item.key = path
-    item.delete()
+    try:
+      item.delete()
+    except BotoServerError, e:
+      e.body = (repr(e.body) +
+                ' while deleting bucket=%s, path=%s' % (bucket, path))
+      raise
 
-  def upload_file(self, source_path, dest_bucket, dest_path):
+  def upload_file(self, source_path, dest_bucket, dest_path,
+                  predefined_acl=None, fine_grained_acl_list=None):
     """Upload contents of a local file to Google Storage.
 
-    TODO(epoger): Add the extra parameters provided by upload_file() within
+    TODO(epoger): Add the only_if_modified param provided by upload_file() in
     https://github.com/google/skia-buildbot/blob/master/slave/skia_slave_scripts/utils/old_gs_utils.py ,
     so we can replace that function with this one.
 
@@ -137,12 +156,96 @@
       source_path: full path (local-OS-style) on local disk to read from
       dest_bucket: GCS bucket to copy the file to
       dest_path: full path (Posix-style) within that bucket
+      predefined_acl: which predefined ACL to apply to the file on Google
+          Storage; must be one of the PREDEFINED_ACL_* constants defined above.
+          If None, inherits dest_bucket's default object ACL.
+          TODO(epoger): add unittests for this param, although it seems to work
+          in my manual testing
+      fine_grained_acl_list: list of (id_type, id_value, permission) tuples
+          to apply to the uploaded file (on top of the predefined_acl),
+          or None if predefined_acl is sufficient
     """
-    conn = self._create_connection()
-    b = conn.get_bucket(bucket_name=dest_bucket)
+    b = self._connect_to_bucket(bucket_name=dest_bucket)
     item = Key(b)
     item.key = dest_path
-    item.set_contents_from_filename(filename=source_path)
+    try:
+      item.set_contents_from_filename(filename=source_path,
+                                      policy=predefined_acl)
+    except BotoServerError, e:
+      e.body = (repr(e.body) +
+                ' while uploading source_path=%s to bucket=%s, path=%s' % (
+                    source_path, dest_bucket, item.key))
+      raise
+    # TODO(epoger): This may be inefficient, because it calls
+    # _connect_to_bucket() again.  Depending on how expensive that
+    # call is, we may want to optimize this.
+    for (id_type, id_value, permission) in fine_grained_acl_list or []:
+      self.set_acl(
+          bucket=dest_bucket, path=item.key,
+          id_type=id_type, id_value=id_value, permission=permission)
+
+  def upload_dir_contents(self, source_dir, dest_bucket, dest_dir,
+                          predefined_acl=None, fine_grained_acl_list=None):
+    """Recursively upload contents of a local directory to Google Storage.
+
+    params:
+      source_dir: full path (local-OS-style) on local disk of directory to copy
+          contents of
+      dest_bucket: GCS bucket to copy the files into
+      dest_dir: full path (Posix-style) within that bucket; write the files into
+          this directory
+      predefined_acl: which predefined ACL to apply to the files on Google
+          Storage; must be one of the PREDEFINED_ACL_* constants defined above.
+          If None, inherits dest_bucket's default object ACL.
+          TODO(epoger): add unittests for this param, although it seems to work
+          in my manual testing
+      fine_grained_acl_list: list of (id_type, id_value, permission) tuples
+          to apply to every file uploaded (on top of the predefined_acl),
+          or None if predefined_acl is sufficient
+          TODO(epoger): add unittests for this param, although it seems to work
+          in my manual testing
+
+    The copy operates as a "merge with overwrite": any files in source_dir will
+    be "overlaid" on top of the existing content in dest_dir.  Existing files
+    with the same names will be overwritten.
+
+    TODO(epoger): Upload multiple files simultaneously to reduce latency.
+
+    TODO(epoger): Add a "noclobber" mode that will not upload any files would
+    overwrite existing files in Google Storage.
+
+    TODO(epoger): Consider adding a do_compress parameter that would compress
+    the file using gzip before upload, and add a "Content-Encoding:gzip" header
+    so that HTTP downloads of the file would be unzipped automatically.
+    See https://developers.google.com/storage/docs/gsutil/addlhelp/
+        WorkingWithObjectMetadata#content-encoding
+    """
+    b = self._connect_to_bucket(bucket_name=dest_bucket)
+    for filename in sorted(os.listdir(source_dir)):
+      local_path = os.path.join(source_dir, filename)
+      if os.path.isdir(local_path):
+        self.upload_dir_contents(  # recurse
+            source_dir=local_path, dest_bucket=dest_bucket,
+            dest_dir=posixpath.join(dest_dir, filename),
+            predefined_acl=predefined_acl,
+            fine_grained_acl_list=fine_grained_acl_list)
+      else:
+        item = Key(b)
+        item.key = posixpath.join(dest_dir, filename)
+        try:
+          item.set_contents_from_filename(
+              filename=local_path, policy=predefined_acl)
+        except BotoServerError, e:
+          e.body = (repr(e.body) +
+                    ' while uploading local_path=%s to bucket=%s, path=%s' % (
+                        local_path, dest_bucket, item.key))
+          raise
+        # TODO(epoger): This may be inefficient, because it calls
+        # _connect_to_bucket() for every file.  Depending on how expensive that
+        # call is, we may want to optimize this.
+        for (id_type, id_value, permission) in fine_grained_acl_list or []:
+          self.set_acl(
+              bucket=dest_bucket, path=item.key,
+              id_type=id_type, id_value=id_value, permission=permission)
 
   def download_file(self, source_bucket, source_path, dest_path,
                     create_subdirs_if_needed=False):
@@ -155,14 +258,59 @@
       create_subdirs_if_needed: boolean; whether to create subdirectories as
           needed to create dest_path
     """
-    conn = self._create_connection()
-    b = conn.get_bucket(bucket_name=source_bucket)
+    b = self._connect_to_bucket(bucket_name=source_bucket)
     item = Key(b)
     item.key = source_path
     if create_subdirs_if_needed:
       _makedirs_if_needed(os.path.dirname(dest_path))
     with open(dest_path, 'w') as f:
-      item.get_contents_to_file(fp=f)
+      try:
+        item.get_contents_to_file(fp=f)
+      except BotoServerError, e:
+        e.body = (repr(e.body) +
+                  ' while downloading bucket=%s, path=%s to local_path=%s' % (
+                      source_bucket, source_path, dest_path))
+        raise
+
+  def download_dir_contents(self, source_bucket, source_dir, dest_dir):
+    """Recursively download contents of a Google Storage directory to local disk
+
+    params:
+      source_bucket: GCS bucket to copy the files from
+      source_dir: full path (Posix-style) within that bucket; read the files
+          from this directory
+      dest_dir: full path (local-OS-style) on local disk of directory to copy
+          the files into
+
+    The copy operates as a "merge with overwrite": any files in source_dir will
+    be "overlaid" on top of the existing content in dest_dir.  Existing files
+    with the same names will be overwritten.
+
+    TODO(epoger): Download multiple files simultaneously to reduce latency.
+    """
+    _makedirs_if_needed(dest_dir)
+    b = self._connect_to_bucket(bucket_name=source_bucket)
+    (dirs, files) = self.list_bucket_contents(
+        bucket=source_bucket, subdir=source_dir)
+
+    for filename in files:
+      item = Key(b)
+      item.key = posixpath.join(source_dir, filename)
+      dest_path = os.path.join(dest_dir, filename)
+      with open(dest_path, 'w') as f:
+        try:
+          item.get_contents_to_file(fp=f)
+        except BotoServerError, e:
+          e.body = (repr(e.body) +
+                    ' while downloading bucket=%s, path=%s to local_path=%s' % (
+                        source_bucket, item.key, dest_path))
+          raise
+
+    for dirname in dirs:
+      self.download_dir_contents(  # recurse
+          source_bucket=source_bucket,
+          source_dir=posixpath.join(source_dir, dirname),
+          dest_dir=os.path.join(dest_dir, dirname))
 
   def get_acl(self, bucket, path, id_type, id_value):
     """Retrieve partial access permissions on a single file in Google Storage.
@@ -172,6 +320,9 @@
     rights based on *other* id_types (e.g., perhaps they have group access
     rights, beyond their individual access rights).
 
+    TODO(epoger): What if the remote file does not exist?  This should probably
+    raise an exception in that case.
+
     Params:
       bucket: GS bucket
       path: full path (Posix-style) to the file within that bucket
@@ -184,8 +335,7 @@
         permissions have been set.
     """
     field = FIELD_BY_ID_TYPE[id_type]
-    conn = self._create_connection()
-    b = conn.get_bucket(bucket_name=bucket)
+    b = self._connect_to_bucket(bucket_name=bucket)
     acls = b.get_acl(key_name=path)
     matching_entries = [entry for entry in acls.entries.entry_list
                         if (entry.scope.type == id_type) and
@@ -208,6 +358,9 @@
     If there is already a permission set on this file for this id_type/id_value
     combination, this call will overwrite it.
 
+    TODO(epoger): What if the remote file does not exist?  This should probably
+    raise an exception in that case.
+
     Params:
       bucket: GS bucket
       path: full path (Posix-style) to the file within that bucket
@@ -231,8 +384,7 @@
       assert PERMISSION_WRITE == get_acl(bucket, path, id_type, id_value)
     """
     field = FIELD_BY_ID_TYPE[id_type]
-    conn = self._create_connection()
-    b = conn.get_bucket(bucket_name=bucket)
+    b = self._connect_to_bucket(bucket_name=bucket)
     acls = b.get_acl(key_name=path)
 
     # Remove any existing entries that refer to the same id_type/id_value,
@@ -257,6 +409,9 @@
   def list_bucket_contents(self, bucket, subdir=None):
     """Returns files in the Google Storage bucket as a (dirs, files) tuple.
 
+    TODO(epoger): This should raise an exception if subdir does not exist in
+    Google Storage; right now, it just returns empty contents.
+
     Args:
       bucket: name of the Google Storage bucket
       subdir: directory within the bucket to list, or None for root directory
@@ -267,8 +422,7 @@
       prefix += '/'
     prefix_length = len(prefix) if prefix else 0
 
-    conn = self._create_connection()
-    b = conn.get_bucket(bucket_name=bucket)
+    b = self._connect_to_bucket(bucket_name=bucket)
     lister = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/')
     dirs = []
     files = []
@@ -280,6 +434,18 @@
         dirs.append(item.name[prefix_length:-1])
     return (dirs, files)
 
+  def _connect_to_bucket(self, bucket_name):
+    """Returns a Bucket object we can use to access a particular bucket in GS.
+
+    Params:
+      bucket_name: name of the bucket (e.g., 'chromium-skia-gm')
+    """
+    try:
+      return self._create_connection().get_bucket(bucket_name=bucket_name)
+    except BotoServerError, e:
+      e.body = repr(e.body) + ' while connecting to bucket=%s' % bucket_name
+      raise
+
   def _create_connection(self):
     """Returns a GSConnection object we can use to access Google Storage."""
     if self._gs_access_key_id:
@@ -349,16 +515,26 @@
   subdir = 'subdir'
   filenames_to_upload = ['file1', 'file2']
 
-  # Upload test files to Google Storage.
+  # Upload test files to Google Storage, checking that their fine-grained
+  # ACLs were set correctly.
+  id_type = ID_TYPE_GROUP_BY_DOMAIN
+  id_value = 'chromium.org'
+  set_permission = PERMISSION_READ
   local_src_dir = tempfile.mkdtemp()
   os.mkdir(os.path.join(local_src_dir, subdir))
   try:
     for filename in filenames_to_upload:
       with open(os.path.join(local_src_dir, subdir, filename), 'w') as f:
         f.write('contents of %s\n' % filename)
-      gs.upload_file(source_path=os.path.join(local_src_dir, subdir, filename),
-                     dest_bucket=bucket,
-                     dest_path=posixpath.join(remote_dir, subdir, filename))
+      dest_path = posixpath.join(remote_dir, subdir, filename)
+      gs.upload_file(
+          source_path=os.path.join(local_src_dir, subdir, filename),
+          dest_bucket=bucket, dest_path=dest_path,
+          fine_grained_acl_list=[(id_type, id_value, set_permission)])
+      got_permission = gs.get_acl(bucket=bucket, path=dest_path,
+                                  id_type=id_type, id_value=id_value)
+      assert got_permission == set_permission, '%s == %s' % (
+          got_permission, set_permission)
   finally:
     shutil.rmtree(local_src_dir)
 
@@ -434,10 +610,68 @@
   assert files == [], '%s == []' % files
 
 
+def _test_dir_upload_and_download():
+  """Test upload_dir_contents() and download_dir_contents()."""
+  try:
+    gs = GSUtils(
+        boto_file_path=os.path.expanduser(os.path.join('~', '.boto')))
+  except:
+    print """
+Failed to instantiate GSUtils object with default .boto file path.
+Do you have a ~/.boto file that provides the credentials needed to read
+and write gs://chromium-skia-gm ?
+"""
+    raise
+
+  bucket = 'chromium-skia-gm'
+  remote_dir = 'gs_utils_test/%d' % random.randint(0, sys.maxint)
+  subdir = 'subdir'
+  filenames = ['file1', 'file2']
+
+  # Create directory tree on local disk, and upload it.
+  local_src_dir = tempfile.mkdtemp()
+  os.mkdir(os.path.join(local_src_dir, subdir))
+  try:
+    for filename in filenames:
+      with open(os.path.join(local_src_dir, subdir, filename), 'w') as f:
+        f.write('contents of %s\n' % filename)
+    gs.upload_dir_contents(source_dir=local_src_dir, dest_bucket=bucket,
+                           dest_dir=remote_dir)
+  finally:
+    shutil.rmtree(local_src_dir)
+
+  # Validate the list of the files we uploaded to Google Storage.
+  (dirs, files) = gs.list_bucket_contents(
+      bucket=bucket, subdir=remote_dir)
+  assert dirs == [subdir], '%s == [%s]' % (dirs, subdir)
+  assert files == [], '%s == []' % files
+  (dirs, files) = gs.list_bucket_contents(
+      bucket=bucket, subdir=posixpath.join(remote_dir, subdir))
+  assert dirs == [], '%s == []' % dirs
+  assert files == filenames, '%s == %s' % (files, filenames)
+
+  # Download the directory tree we just uploaded, make sure its contents
+  # are what we expect, and then delete the tree in Google Storage.
+  local_dest_dir = tempfile.mkdtemp()
+  try:
+    gs.download_dir_contents(source_bucket=bucket, source_dir=remote_dir,
+                             dest_dir=local_dest_dir)
+    for filename in filenames:
+      with open(os.path.join(local_dest_dir, subdir, filename)) as f:
+        file_contents = f.read()
+      assert file_contents == 'contents of %s\n' % filename, (
+          '%s == "contents of %s\n"' % (file_contents, filename))
+  finally:
+    shutil.rmtree(local_dest_dir)
+    for filename in filenames:
+      gs.delete_file(bucket=bucket,
+                     path=posixpath.join(remote_dir, subdir, filename))
+
+
 # TODO(epoger): How should we exercise these self-tests?
 # See http://skbug.com/2751
 if __name__ == '__main__':
   _test_public_read()
   _test_authenticated_round_trip()
+  _test_dir_upload_and_download()
   # TODO(epoger): Add _test_unauthenticated_access() to make sure we raise
   # an exception when we try to access without needed credentials.