blob: 5d8d379a93341669ba68cc358a4d3d4525812987 [file] [log] [blame]
package main
/*
Program for automating creation and setup of Swarming bot VMs.
Bot numbers should be assigned as follows:
1-99 (skia-e-gce-0..): Temporary or experimental bots.
100-499 (skia-e-gce-[1234]..): Linux
100-199: linux-small
200-299: linux-medium
300-399: linux-large
400-499: linux-skylake
405-408: linux-amd
500-699 (skia-e-gce-[56]..): Windows
500-599: win-medium
600-699: win-large
700-999: unassigned
*/
import (
"bytes"
"context"
"encoding/json"
"flag"
"fmt"
"os"
"path"
"path/filepath"
"runtime"
"strings"
"go.skia.org/infra/go/common"
"go.skia.org/infra/go/exec"
"go.skia.org/infra/go/gce"
"go.skia.org/infra/go/gce/swarming/instance_types"
"go.skia.org/infra/go/skerr"
"go.skia.org/infra/go/sklog"
"go.skia.org/infra/go/util"
)
const (
kindExternal = "external"
kindInternal = "internal"
kindDev = "dev"
)
var (
instanceKinds = []string{
kindExternal,
kindInternal,
kindDev,
}
validInstanceTypes = []string{
instance_types.INSTANCE_TYPE_CT,
instance_types.INSTANCE_TYPE_LINUX_MICRO,
instance_types.INSTANCE_TYPE_LINUX_SMALL,
instance_types.INSTANCE_TYPE_LINUX_MEDIUM,
instance_types.INSTANCE_TYPE_LINUX_LARGE,
instance_types.INSTANCE_TYPE_LINUX_AMD,
instance_types.INSTANCE_TYPE_LINUX_SKYLAKE,
instance_types.INSTANCE_TYPE_WIN_MEDIUM,
instance_types.INSTANCE_TYPE_WIN_LARGE,
}
winInstanceTypes = []string{
instance_types.INSTANCE_TYPE_WIN_MEDIUM,
instance_types.INSTANCE_TYPE_WIN_LARGE,
}
ansibleDirectory = filepath.Join("skolo", "ansible")
linuxAnsiblePlaybook = filepath.Join("switchboard", "linux.yml")
winAnsiblePlaybook = filepath.Join("switchboard", "win.yml")
)
type validMachineRange struct {
min int
max int
kind string
expectedInstanceType string // If empty, any instance type is accepted.
}
func (r validMachineRange) matchesNumberAndKind(n int, kind string) bool {
return n >= r.min && n <= r.max && kind == r.kind
}
var validMachineRanges = []validMachineRange{
// skia-e-gce-* and skia-ct-gce-* machines.
{0, 99, kindExternal, ""}, // Any instance type is accepted, including CT machines.
{100, 199, kindExternal, instance_types.INSTANCE_TYPE_LINUX_SMALL},
{200, 299, kindExternal, instance_types.INSTANCE_TYPE_LINUX_MEDIUM},
{300, 399, kindExternal, instance_types.INSTANCE_TYPE_LINUX_LARGE},
{400, 404, kindExternal, instance_types.INSTANCE_TYPE_LINUX_SKYLAKE},
{405, 408, kindExternal, instance_types.INSTANCE_TYPE_LINUX_AMD},
{409, 499, kindExternal, instance_types.INSTANCE_TYPE_LINUX_SKYLAKE},
{500, 599, kindExternal, instance_types.INSTANCE_TYPE_WIN_MEDIUM},
{600, 699, kindExternal, instance_types.INSTANCE_TYPE_WIN_LARGE},
// skia-i-gce-* machines.
{100, 199, kindInternal, instance_types.INSTANCE_TYPE_LINUX_SMALL},
{200, 299, kindInternal, instance_types.INSTANCE_TYPE_LINUX_LARGE},
// skia-d-gce-* machines.
{100, 100, kindDev, instance_types.INSTANCE_TYPE_LINUX_SMALL}, // Exactly one machine.
{101, 599, kindDev, instance_types.INSTANCE_TYPE_LINUX_MICRO},
}
type vmsToCreate struct {
vms []*gce.Instance
zone string
project string
configuredViaAnsible bool
}
func makeVMsToCreate(ctx context.Context, kind, instanceType string, forceInstanceType bool, instanceNums []int) (vmsToCreate, error) {
if !util.In(kind, instanceKinds) {
return vmsToCreate{}, skerr.Fmt("Unknown kind: %s", kind)
}
retval := vmsToCreate{
zone: gce.ZONE_DEFAULT,
project: gce.PROJECT_ID_SWARMING,
// CT instances are the only ones that we do not configure via Ansible.
configuredViaAnsible: instanceType != instance_types.INSTANCE_TYPE_CT,
}
var getInstance func(int) *gce.Instance
switch instanceType {
case instance_types.INSTANCE_TYPE_CT:
getInstance = func(num int) *gce.Instance { return instance_types.SkiaCT(num) }
case instance_types.INSTANCE_TYPE_LINUX_MICRO:
getInstance = func(num int) *gce.Instance { return instance_types.LinuxMicro(num) }
case instance_types.INSTANCE_TYPE_LINUX_SMALL:
getInstance = func(num int) *gce.Instance { return instance_types.LinuxSmall(num) }
case instance_types.INSTANCE_TYPE_LINUX_MEDIUM:
getInstance = func(num int) *gce.Instance { return instance_types.LinuxMedium(num) }
case instance_types.INSTANCE_TYPE_LINUX_LARGE:
getInstance = func(num int) *gce.Instance { return instance_types.LinuxLarge(num) }
case instance_types.INSTANCE_TYPE_LINUX_AMD:
retval.zone = gce.ZONE_AMD
getInstance = func(num int) *gce.Instance { return instance_types.LinuxAmd(num) }
case instance_types.INSTANCE_TYPE_LINUX_SKYLAKE:
retval.zone = gce.ZONE_SKYLAKE
getInstance = func(num int) *gce.Instance { return instance_types.LinuxSkylake(num) }
case instance_types.INSTANCE_TYPE_WIN_MEDIUM:
getInstance = func(num int) *gce.Instance {
instance, err := instance_types.WinMedium(ctx, num)
if err != nil {
sklog.Fatal(err)
}
return instance
}
case instance_types.INSTANCE_TYPE_WIN_LARGE:
getInstance = func(num int) *gce.Instance {
instance, err := instance_types.WinLarge(ctx, num)
if err != nil {
sklog.Fatal(err)
}
return instance
}
default:
// Should never happen.
panic(fmt.Sprintf("Unknown instanceType: %q. This is a bug.", instanceType))
}
if kind == kindInternal {
retval.project = gce.PROJECT_ID_INTERNAL_SWARMING
getInstanceInner := getInstance
getInstance = func(num int) *gce.Instance {
return instance_types.Internal(getInstanceInner(num))
}
} else if kind == kindDev {
getInstanceInner := getInstance
getInstance = func(num int) *gce.Instance {
return instance_types.Dev(getInstanceInner(num))
}
}
// Validate that the given instance type and machine numbers correspond with the per-type range
// assignment.
if !forceInstanceType {
for _, instanceNum := range instanceNums {
found := false
for _, r := range validMachineRanges {
if r.matchesNumberAndKind(instanceNum, kind) {
found = true
// Empty string means any instance type is valid.
if r.expectedInstanceType != "" && instanceType != r.expectedInstanceType {
return vmsToCreate{}, skerr.Fmt("Machine number %d is expected to be of instance type %s. To force a different type, re-run with --force-type.", instanceNum, r.expectedInstanceType)
}
break
}
}
if !found {
return vmsToCreate{}, skerr.Fmt("Machine number %d is not in any known machine range, and thus its expected instance type cannot be determined. To proceed anyway, re-run with --force-type.", instanceNum)
}
}
}
// Create the Instance objects.
for _, num := range instanceNums {
retval.vms = append(retval.vms, getInstance(num))
}
return retval, nil
}
func main() {
var (
// Flags.
instances = flag.String("instances", "", "Which instances to create/delete, eg. \"2,3-10,22\"")
create = flag.Bool("create", false, "Create the instance. Either --create or --delete is required.")
delete = flag.Bool("delete", false, "Delete the instance. Either --create or --delete is required.")
deleteDataDisk = flag.Bool("delete-data-disk", false, "Delete the data disk. Only valid with --delete")
dev = flag.Bool("dev", false, "Whether or not the bots connect to chromium-swarm-dev.")
dumpJson = flag.Bool("dump-json", false, "Dump out JSON for each of the instances to create/delete and exit without changing anything.")
ignoreExists = flag.Bool("ignore-exists", false, "Do not fail out when creating a resource which already exists or deleting a resource which does not exist.")
instanceType = flag.String("type", "", fmt.Sprintf("Type of instance; one of: %v", validInstanceTypes))
forceInstanceType = flag.Bool("force-type", false, "Skip validation of instance types by machine number.")
internal = flag.Bool("internal", false, "Whether or not the bots are internal.")
skipUpdateSSHGCEConfig = flag.Bool("skip-update-ssh-gce-config", false, "Do not update //skolo/ansible/ssh.cfg after creating or deleting GCE machines.")
skipAnsiblePlaybook = flag.Bool("skip-ansible-playbook", false, "Do not run the corresponding Ansible playbook to finish setting up the newly created machines.")
)
common.Init()
// Validation.
if *create == *delete {
sklog.Fatal("Please specify --create or --delete, but not both.")
}
if !util.In(*instanceType, validInstanceTypes) {
sklog.Fatalf("--type must be one of %v", validInstanceTypes)
}
instanceNums, err := util.ParseIntSet(*instances)
if err != nil {
sklog.Fatal(err)
}
if len(instanceNums) == 0 {
sklog.Fatal("Please specify at least one instance number via --instances.")
}
if *dev && *internal {
sklog.Fatalf("At most one of --dev or --internal must be set.")
}
kind := kindExternal
if *internal {
kind = kindInternal
}
if *dev {
kind = kindDev
}
// Get the list of VMs to create.
ctx := context.Background()
vmsToCreate, err := makeVMsToCreate(ctx, kind, *instanceType, *forceInstanceType, instanceNums)
if err != nil {
sklog.Fatal(err)
}
// Create the GCloud object.
g, err := gce.NewLocalGCloud(vmsToCreate.project, vmsToCreate.zone)
if err != nil {
sklog.Fatal(err)
}
if err := g.CheckSsh(); err != nil {
sklog.Fatal(err)
}
// If requested, dump JSON for the given instances and exit.
if *dumpJson {
verb := "create"
if *delete {
verb = "delete"
}
data := map[string][]*gce.Instance{
verb: vmsToCreate.vms,
}
b, err := json.MarshalIndent(data, "", " ")
if err != nil {
sklog.Fatal(err)
}
sklog.Infof("\n%s", string(b))
return
}
// Compute absolute path to the Ansible directory.
_, filename, _, _ := runtime.Caller(0)
repoRoot := filepath.Join(path.Dir(filename), "..", "..", "..")
absAnsibleDir := filepath.Join(repoRoot, ansibleDirectory)
// If the machines are configured via Ansible, ensure they are included in the hosts.yml file.
if *create && vmsToCreate.configuredViaAnsible && !*skipAnsiblePlaybook {
sklog.Info("Querying Ansible inventory file.")
inventoryFile := filepath.Join(absAnsibleDir, "hosts.yml")
stdout := &bytes.Buffer{}
if err := exec.Run(
ctx,
&exec.Command{
Name: "ansible",
Args: []string{"--inventory", inventoryFile, "--list-hosts", "all"},
Dir: absAnsibleDir,
Stdout: stdout,
},
); err != nil {
sklog.Fatal(err)
}
// Contains one line per machine in //skolo/ansible/hosts.yml. Host ranges are expanded into
// individual hosts, e.g. "skia-e-gce-[100:199]" is expanded into "skia-e-gce-100",
// "skia-e-gce-101", "skia-e-gce-102", ...
inventory := stdout.String()
for _, vm := range vmsToCreate.vms {
if !strings.Contains(inventory, vm.Name+"\n") {
sklog.Fatalf("Ansible inventory file %s does not contain an entry for machine %q. Please update the inventory file, then re-run this command.", inventoryFile, vm.Name)
}
}
}
// Perform the requested operation.
verb := "Creating"
if *delete {
verb = "Deleting"
}
sklog.Infof("%s instances: %v", verb, instanceNums)
group := util.NewNamedErrGroup()
for _, vm := range vmsToCreate.vms {
vm := vm // https://golang.org/doc/faq#closures_and_goroutines
group.Go(vm.Name, func() error {
if *create {
if err := g.CreateAndSetup(ctx, vm, *ignoreExists); err != nil {
return err
}
} else {
return g.Delete(vm, *ignoreExists, *deleteDataDisk)
}
return nil
})
}
if err := group.Wait(); err != nil {
sklog.Fatal(err)
}
// Update //skolo/ansible/ssh.cfg with the public IPs of all GCE machines.
if *create && vmsToCreate.configuredViaAnsible && !*skipUpdateSSHGCEConfig {
sklog.Info("Updating //skolo/ansible/ssh.cfg")
if err := exec.Run(
ctx,
&exec.Command{
Name: "make",
Args: []string{"update_ssh_gce_config"},
Dir: absAnsibleDir,
Stdout: os.Stdout,
Stderr: os.Stderr,
},
); err != nil {
sklog.Fatal(err)
}
}
// Run Ansible playbook if necessary.
if *create && vmsToCreate.configuredViaAnsible && !*skipAnsiblePlaybook {
playbook := linuxAnsiblePlaybook
if util.In(*instanceType, winInstanceTypes) {
playbook = winAnsiblePlaybook
}
absPlaybookPath := filepath.Join(absAnsibleDir, playbook)
var machines []string
for _, n := range instanceNums {
machines = append(machines, fmt.Sprintf("skia-e-gce-%03d", n))
}
commaSeparatedMachines := strings.Join(machines, ",")
sklog.Infof("Running %s Ansible playbook...", absPlaybookPath)
cmd := &exec.Command{
Name: "ansible-playbook",
Args: []string{absPlaybookPath, "--limit", commaSeparatedMachines},
Dir: absAnsibleDir,
Stdout: os.Stdout,
Stderr: os.Stderr,
}
if util.In(*instanceType, winInstanceTypes) {
// For some reason, sometimes passwordless auth does not work on Windows machines.
cmd.Args = append(cmd.Args, "--ask-pass")
}
if err := exec.Run(ctx, cmd); err != nil {
sklog.Fatal(err)
}
}
}