[Bazel] Support for a custom prebuilt emscripten cache (#1620)

This adds support for using a prebuilt cache as an archive instead of
building it from scratch every time.

This solves multiple problems:

- build speed on CI machines that have a cold cache
- instead of building the secondary cache every time, it can simply
download it
- effective support for multiple caches (#1581)
- emscripten supports having multiple caches on its own, but it builds
`thinlto`, `lto`, and other cache combinations on demand. This doesn't
work in the Bazel world, where the cache needs to be frozen.
- the current solution allowed building a secondary cache, but only with
a single configuration
- this allows the use of any archive that contains any emscripten cache
- this PR does not handle how that cache is built - you can build it
with embuilder, zip it and serve it from your server or whatever you
want
- secondary cache hermeticity problem
- the current solution for the secondary cache generated an `emscripten_config`
file that contained the full path to the cache.
- this was not hermetic as this path could be different on different
machines
- this PR calculates the cache path by using the same trick with
environment variables, ensuring that the `emscripten_config` file is always
the same, regardless of the machine
- additionally, it ensures that the cache is provided as an input to the
toolchain so that Bazel can correctly add it to the sandbox environment

After applying this patch to our internal codebase, the regular builds
of WASM code went down from 12-15 minutes to about 30 seconds, regardless
of which machine they run on, as long as the remote cache is warm.

---------

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
diff --git a/bazel/README.md b/bazel/README.md
index cc4121e..1212b05 100644
--- a/bazel/README.md
+++ b/bazel/README.md
@@ -87,3 +87,23 @@
 ```
 
 See `test_external/` for an example using [embind](https://emscripten.org/docs/porting/connecting_cpp_and_javascript/embind.html).
+
+Alternatively, you can use the embuilder to build the cache manually and put it into
+an archive that you serve from your HTTP server. Then you can declare it in your
+`MODULE.bazel` as follows:
+
+```starlark
+
+emscripten_cache = use_extension(
+    "@emsdk//:emscripten_cache.bzl",
+    "emscripten_cache",
+)
+
+emscripten_cache.prebuilt_cache(
+    http_archive_url = "https://my-host.com/my-emsdk-cache-4.0.16.tar.gz",
+    sha256 = "3e88abcbd22bac7b05af416c8f1859d12572c8e9356db604a2768fcfda863da8",
+    strip_prefix = "my-emsdk-cache",
+)
+```
+
+`prebuilt_cache` and `configuration`/`targets` are mutually exclusive. If both are specified, `prebuilt_cache` takes precedence and `configuration`/`targets` are ignored.
diff --git a/bazel/emscripten_build_file.bzl b/bazel/emscripten_build_file.bzl
index 0d7aff2..2cf7d95 100644
--- a/bazel/emscripten_build_file.bzl
+++ b/bazel/emscripten_build_file.bzl
@@ -19,6 +19,13 @@
 )
 
 filegroup(
+    name = "builtin_cache",
+    srcs = glob([
+        "emscripten/cache/**",
+    ]),
+)
+
+filegroup(
     name = "emcc_common",
     srcs = [
         "emscripten/emcc.py",
diff --git a/bazel/emscripten_cache.bzl b/bazel/emscripten_cache.bzl
index 945c19c..06ebb17 100644
--- a/bazel/emscripten_cache.bzl
+++ b/bazel/emscripten_cache.bzl
@@ -3,6 +3,21 @@
 exports_files(['emscripten_config'])
 """
 
+BUILD_FILE_USE_BUILTIN_CACHE = """
+alias(
+    name = "emscripten_cache",
+    actual = "{}//:builtin_cache",
+)
+"""
+
+BUILD_FILE_USE_SECONDARY_CACHE = """
+filegroup(
+    name = "emscripten_cache",
+    srcs = glob(["cache/**"]),
+    visibility = ["//visibility:public"],
+)
+"""
+
 EMBUILDER_CONFIG_TEMPLATE = """
 CACHE = '{cache}'
 BINARYEN_ROOT = '{binaryen_root}'
@@ -29,6 +44,26 @@
     else:
         fail("Unsupported operating system")
 
+def get_bin_deps_repo_name(repository_ctx):
+    if repository_ctx.os.name.startswith("linux"):
+        if "amd64" in repository_ctx.os.arch or "x86_64" in repository_ctx.os.arch:
+            return "@emscripten_bin_linux"
+        elif "aarch64" in repository_ctx.os.arch:
+            return "@emscripten_bin_linux_arm64"
+        else:
+            fail("Unsupported architecture for Linux")
+    elif repository_ctx.os.name.startswith("mac"):
+        if "amd64" in repository_ctx.os.arch or "x86_64" in repository_ctx.os.arch:
+            return "@emscripten_bin_mac"
+        elif "aarch64" in repository_ctx.os.arch:
+            return "@emscripten_bin_mac_arm64"
+        else:
+            fail("Unsupported architecture for MacOS")
+    elif repository_ctx.os.name.startswith("windows"):
+        return "@emscripten_bin_win"
+    else:
+        fail("Unsupported operating system")
+
 def _emscripten_cache_repository_impl(repository_ctx):
     # Read the default emscripten configuration file
     default_config = repository_ctx.read(
@@ -37,7 +72,26 @@
         ),
     )
 
-    if repository_ctx.attr.targets or repository_ctx.attr.configuration:
+    repo_metadata = None
+    build_file_content = BUILD_FILE_CONTENT_TEMPLATE
+    use_builtin_cache = True
+
+    if repository_ctx.attr.prebuilt_cache_url:
+        repository_ctx.download_and_extract(
+            url = repository_ctx.attr.prebuilt_cache_url,
+            output = "cache",
+            sha256 = repository_ctx.attr.prebuilt_cache_sha256,
+            stripPrefix = repository_ctx.attr.prebuilt_cache_strip_prefix,
+        )
+
+        # Use the prebuilt cache
+        use_builtin_cache = False
+
+        # Bazel 7 does not have the repo_metadata API, so prebuilt cache on Bazel 7 will not be marked as reproducible. This is not ideal, but it is a limitation of Bazel 7.
+        if hasattr(repository_ctx, "repo_metadata"):
+            repo_metadata = repository_ctx.repo_metadata(reproducible = True)
+
+    elif repository_ctx.attr.targets or repository_ctx.attr.configuration:
         root, script_ext = get_root_and_script_ext(repository_ctx)
         llvm_root = root.get_child("bin")
         cache = repository_ctx.path("cache")
@@ -65,7 +119,7 @@
         repository_ctx.report_progress("Building secondary cache")
         result = repository_ctx.execute(
             embuilder_args,
-            quiet = True,
+            quiet = False,
             environment = {
                 "EM_IGNORE_SANITY": "1",
                 "EM_NODE_JS": "empty",
@@ -74,40 +128,70 @@
         if result.return_code != 0:
             fail("Embuilder exited with a non-zero return code")
 
-        # Override Emscripten's cache with the secondary cache
-        default_config += "CACHE = '{}'\n".format(cache)
+        use_builtin_cache = False
+
+    if use_builtin_cache:
+        build_file_content += BUILD_FILE_USE_BUILTIN_CACHE.format(get_bin_deps_repo_name(repository_ctx))
+    else:
+        default_config += 'CACHE = os.path.join(os.path.dirname(os.environ["EM_CONFIG_PATH"]), "cache")\n'
+        build_file_content += BUILD_FILE_USE_SECONDARY_CACHE
 
     # Create the configuration file for the toolchain and export
     repository_ctx.file("emscripten_config", default_config)
-    repository_ctx.file("BUILD.bazel", BUILD_FILE_CONTENT_TEMPLATE)
+    repository_ctx.file("BUILD.bazel", build_file_content)
+
+    return repo_metadata
 
 _emscripten_cache_repository = repository_rule(
     implementation = _emscripten_cache_repository_impl,
     attrs = {
         "configuration": attr.string_list(),
         "targets": attr.string_list(),
+        "prebuilt_cache_url": attr.string(),
+        "prebuilt_cache_sha256": attr.string(),
+        "prebuilt_cache_strip_prefix": attr.string(),
     },
 )
 
 def _emscripten_cache_impl(ctx):
     all_configuration = []
     all_targets = []
+
+    prebuilt_cache_url = ""
+    prebuilt_cache_sha256 = ""
+    prebuilt_cache_strip_prefix = ""
+    prebuilt_cache_seen = False
     for mod in ctx.modules:
         for configuration in mod.tags.configuration:
             all_configuration += configuration.flags
         for targets in mod.tags.targets:
             all_targets += targets.targets
+        for prebuilt_cache in mod.tags.prebuilt_cache:
+            if prebuilt_cache_seen:
+                fail("Only one prebuilt_cache tag is allowed")
+            prebuilt_cache_url = prebuilt_cache.http_archive_url
+            prebuilt_cache_sha256 = prebuilt_cache.sha256
+            prebuilt_cache_strip_prefix = prebuilt_cache.strip_prefix
+            prebuilt_cache_seen = True
 
     _emscripten_cache_repository(
         name = "emscripten_cache",
         configuration = all_configuration,
         targets = all_targets,
+        prebuilt_cache_url = prebuilt_cache_url,
+        prebuilt_cache_sha256 = prebuilt_cache_sha256,
+        prebuilt_cache_strip_prefix = prebuilt_cache_strip_prefix,
     )
 
 emscripten_cache = module_extension(
     tag_classes = {
         "configuration": tag_class(attrs = {"flags": attr.string_list()}),
         "targets": tag_class(attrs = {"targets": attr.string_list()}),
+        "prebuilt_cache": tag_class(attrs = {
+            "http_archive_url": attr.string(mandatory = True),
+            "sha256": attr.string(mandatory = True),
+            "strip_prefix": attr.string(),
+        }),
     },
     implementation = _emscripten_cache_impl,
 )
diff --git a/bazel/remote_emscripten_repository.bzl b/bazel/remote_emscripten_repository.bzl
index 47308f2..e79fe95 100644
--- a/bazel/remote_emscripten_repository.bzl
+++ b/bazel/remote_emscripten_repository.bzl
@@ -63,6 +63,7 @@
         name = common_files_name,
         srcs = [
             "@emscripten_cache//:emscripten_config",
+            "@emscripten_cache//:emscripten_cache",
             "@emsdk//emscripten_toolchain:env.sh",
             "@emsdk//emscripten_toolchain:env.bat",
             "@rules_nodejs//nodejs:current_node_toolchain",
diff --git a/bazel/test_external/.bazelrc b/bazel/test_external/.bazelrc
deleted file mode 100644
index fbd75a7..0000000
--- a/bazel/test_external/.bazelrc
+++ /dev/null
@@ -1 +0,0 @@
-build --incompatible_enable_cc_toolchain_resolution
diff --git a/bazel/test_prebuilt_cache/.gitignore b/bazel/test_prebuilt_cache/.gitignore
new file mode 100644
index 0000000..dde5aa1
--- /dev/null
+++ b/bazel/test_prebuilt_cache/.gitignore
@@ -0,0 +1,4 @@
+bazel-bin
+bazel-out
+bazel-test_prebuilt_cache
+bazel-testlogs
diff --git a/bazel/test_prebuilt_cache/BUILD b/bazel/test_prebuilt_cache/BUILD
new file mode 100644
index 0000000..f787346
--- /dev/null
+++ b/bazel/test_prebuilt_cache/BUILD
@@ -0,0 +1,23 @@
+load("@emsdk//emscripten_toolchain:wasm_rules.bzl", "wasm_cc_binary")
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+
+cc_binary(
+    name = "hello-world",
+    srcs = ["hello-world.cc"],
+    copts = [
+        "-flto=thin",
+    ],
+    linkopts = [
+        "-sAUTO_NATIVE_LIBRARIES=0",
+        "-flto=thin",
+    ],
+)
+
+wasm_cc_binary(
+    name = "hello-world-wasm",
+    cc_target = ":hello-world",
+    outputs = [
+        "hello-world.js",
+        "hello-world.wasm",
+    ],
+)
diff --git a/bazel/test_prebuilt_cache/MODULE.bazel b/bazel/test_prebuilt_cache/MODULE.bazel
new file mode 100644
index 0000000..1d9fd74
--- /dev/null
+++ b/bazel/test_prebuilt_cache/MODULE.bazel
@@ -0,0 +1,29 @@
+bazel_dep(name = "rules_cc", version = "0.2.16")
+bazel_dep(name = "emsdk")
+local_path_override(
+    module_name = "emsdk",
+    path = "..",
+)
+
+emscripten_deps = use_extension(
+    "@emsdk//:emscripten_deps.bzl",
+    "emscripten_deps",
+)
+
+# Need to use the same version of Emscripten as the prebuilt cache was built with to ensure compatibility even when future versions of Emscripten are released.
+emscripten_deps.config(
+    version = "5.0.7",
+)
+
+emscripten_cache = use_extension(
+    "@emsdk//:emscripten_cache.bzl",
+    "emscripten_cache",
+)
+
+# This cache was built with Emscripten 5.0.7, and contains opt-thinlto build.
+# It has been built according to these instructions: https://github.com/DoDoENT/bazel-playground/blob/master/tools/emsdk-cache-builder/README.md
+emscripten_cache.prebuilt_cache(
+    http_archive_url = "https://github.com/DoDoENT/bazel-playground/releases/download/emsdk-cache/emsdk-cache-5.0.7.tar.gz",
+    sha256 = "bbfdab09ae64769c4aa977b1d88f776490c459c87e7ed7e31fe00190cd56b14a",
+    strip_prefix = "emsdk-cache",
+)
diff --git a/bazel/test_prebuilt_cache/hello-world.cc b/bazel/test_prebuilt_cache/hello-world.cc
new file mode 100644
index 0000000..ee72c53
--- /dev/null
+++ b/bazel/test_prebuilt_cache/hello-world.cc
@@ -0,0 +1,6 @@
+#include <iostream>
+
+int main(int argc, char** argv) {
+  std::cout << "hello world!" << std::endl;
+  return 0;
+}
diff --git a/bazel/test_secondary_lto_cache/.bazelrc b/bazel/test_secondary_lto_cache/.bazelrc
deleted file mode 100644
index fbd75a7..0000000
--- a/bazel/test_secondary_lto_cache/.bazelrc
+++ /dev/null
@@ -1 +0,0 @@
-build --incompatible_enable_cc_toolchain_resolution
diff --git a/test/test_bazel.ps1 b/test/test_bazel.ps1
index 5c20a15..143c23c 100644
--- a/test/test_bazel.ps1
+++ b/test/test_bazel.ps1
@@ -28,3 +28,7 @@
 bazel build //:hello-world-wasm
 if (-not $?) { Exit $LastExitCode }
 
+Set-Location ..\test_prebuilt_cache
+
+bazel build //:hello-world-wasm --compilation_mode opt # test only release as used prebuilt cache is only for release builds
+if (-not $?) { Exit $LastExitCode }
diff --git a/test/test_bazel.sh b/test/test_bazel.sh
index 7a1d7a2..b2a198e 100755
--- a/test/test_bazel.sh
+++ b/test/test_bazel.sh
@@ -37,3 +37,7 @@
 pushd test_secondary_lto_cache
 bazel build //:hello-world-wasm
 popd
+
+pushd test_prebuilt_cache
+bazel build //:hello-world-wasm --compilation_mode opt # test only release as used prebuilt cache is only for release builds
+popd
diff --git a/test/test_bazel_mac.sh b/test/test_bazel_mac.sh
index e27c657..8be9f90 100755
--- a/test/test_bazel_mac.sh
+++ b/test/test_bazel_mac.sh
@@ -29,3 +29,5 @@
 cd ../test_secondary_lto_cache
 bazel build //:hello-world-wasm
 
+cd ../test_prebuilt_cache
+bazel build //:hello-world-wasm --compilation_mode opt # test only release as used prebuilt cache is only for release builds