blob: a5b158a36d6fa1eb429d194b0988e79c40e0e1df [file] [log] [blame]
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"os"
"sort"
"strings"
"sync"
"time"
"cloud.google.com/go/datastore"
"go.skia.org/infra/go/common"
"go.skia.org/infra/go/httputils"
"go.skia.org/infra/go/sklog"
"go.skia.org/infra/go/swarming"
"go.skia.org/infra/task_scheduler/go/db/firestore"
"go.skia.org/infra/task_scheduler/go/specs"
"go.skia.org/infra/task_scheduler/go/types"
"golang.org/x/oauth2/google"
)
const (
// BOT_GROUP_TMPL is the format of the dimension which ties a task or a bot
// to a bot group. Its single verb takes the bot group ID.
BOT_GROUP_TMPL = "bot-group:%s"
// BOT_NAME_TMPL is the format of the generated bot IDs. Its single verb
// takes either one zero-padded number or a "{first..last}" range of them.
BOT_NAME_TMPL = "skia-d-gce-%s"
// BOT_SECTION_TMPL is the format of one bot_group section of the bot config
// data written to the --bots_cfg file. The first verb takes the bot ID (or
// ID range) built from BOT_NAME_TMPL; the second takes the extra
// "dimensions" line(s) for the group.
BOT_SECTION_TMPL = `bot_group {
bot_id: "%s"
owners: "skiabot@google.com"
auth {
require_service_account: "chromium-swarm-bots@skia-swarming-bots.iam.gserviceaccount.com"
}
system_service_account: "pool-skia@chromium-swarm-dev.iam.gserviceaccount.com"
dimensions: "pool:Skia"
%s
bot_config_script: "skia.py"
}
`
)
var (
// Command-line flags.

// from is the root dir of the repo whose tasks config is read.
from = flag.String("from", "", "Root dir of source repo.")
// to is the root dir of the repo into which the rewritten tasks config is
// written.
to = flag.String("to", "", "Root dir of destination repo.")
// botsCfg is the path of the file which receives the generated bot_group
// sections.
botsCfg = flag.String("bots_cfg", "", "Name of file to write partial bot config data.")
// fsInstance selects the Firestore instance from which historical tasks
// are loaded.
fsInstance = flag.String("firestore_instance", "", "Firestore instance to use, eg. \"production\"")
// now is a Unix timestamp in seconds marking the end of the time window
// used to compute average task durations. Its default is the wall-clock
// time captured when this variable is initialized.
now = flag.Int("now", int(time.Now().Unix()), "Current timestamp; use to make this script reproducible.")

// dimensions are the base dimensions given to every task which gets
// matched to a bot group; main appends the task's bot-group dimension to
// them.
dimensions = []string{
"pool:Skia",
}
)
// main generates a "dummy" copy of a tasks config plus the partial Swarming
// bot config needed to run it.
//
// The steps are:
//
//   - In a background goroutine, load the tasks of the five days before --now
//     from Firestore, average the durations of the successful ones per task
//     name, and rewrite every TaskSpec read from --from so that it only runs
//     dummy.sh with that average duration (in seconds) as its argument.
//   - Meanwhile, list the bots of the "Skia" Swarming pool.
//   - Group the bots by their values for the dimension keys which the task
//     specs reference, and point each task at the first group able to run it.
//   - Write the rewritten tasks config to --to and one bot_group section per
//     used group to the --bots_cfg file.
//
// Every error is reported through sklog.Fatal / sklog.Fatalf.
func main() {
	common.Init()

	nowTs := time.Unix(int64(*now), 0)

	// includeDimensions holds the dimension keys referenced by any task spec.
	// It is filled in by the goroutine below and only read after wg.Wait().
	includeDimensions := map[string]bool{}

	ctx := context.Background()
	ts, err := google.DefaultTokenSource(ctx, datastore.ScopeDatastore, swarming.AUTH_SCOPE)
	if err != nil {
		sklog.Fatal(err)
	}

	// cfgB is the rewritten tasks config. Like includeDimensions, it is
	// produced by the goroutine and consumed after wg.Wait().
	var cfgB *specs.TasksCfg
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()

		// Obtain average task durations for the last 5 days.
		db, err := firestore.NewDBWithParams(ctx, firestore.FIRESTORE_PROJECT, *fsInstance, ts)
		if err != nil {
			sklog.Fatal(err)
		}
		tasks, err := db.GetTasksFromDateRange(ctx, nowTs.Add(-5*24*time.Hour), nowTs, common.REPO_SKIA)
		if err != nil {
			sklog.Fatal(err)
		}
		durations := map[string][]time.Duration{}
		for _, task := range tasks {
			// Only successful tasks with a sane (non-negative) run time
			// contribute to the average.
			if task.Status != types.TASK_STATUS_SUCCESS {
				continue
			}
			duration := task.Finished.Sub(task.Started)
			if duration < time.Duration(0) {
				continue
			}
			durations[task.Name] = append(durations[task.Name], duration)
		}
		avgDurations := make(map[string]time.Duration, len(durations))
		for name, durs := range durations {
			total := time.Duration(0)
			for _, d := range durs {
				total += d
			}
			avgDurations[name] = total / time.Duration(len(durs))
		}

		cfgA, err := specs.ReadTasksCfg(*from)
		if err != nil {
			sklog.Fatal(err)
		}
		cfgB = &specs.TasksCfg{
			Jobs:  make(map[string]*specs.JobSpec, len(cfgA.Jobs)),
			Tasks: make(map[string]*specs.TaskSpec, len(cfgA.Tasks)),
			CasSpecs: map[string]*specs.CasSpec{
				"infrabots": {
					Paths: []string{"infra/bots"},
					Root:  ".",
				},
			},
		}
		for name, jobSpec := range cfgA.Jobs {
			// Leave the JobSpecs the same.
			cfgB.Jobs[name] = jobSpec
		}

		// Visit the tasks in sorted order so that log output is stable.
		taskNames := make([]string, 0, len(cfgA.Tasks))
		for name := range cfgA.Tasks {
			taskNames = append(taskNames, name)
		}
		sort.Strings(taskNames)
		for _, name := range taskNames {
			// The TaskSpec is modified in place and then shared with cfgB.
			taskSpec := cfgA.Tasks[name]
			taskSpec.Caches = nil
			taskSpec.CipdPackages = nil
			// Durations are truncated to whole seconds; zero means either no
			// data or a sub-second average, and falls back to 10 seconds.
			avgDuration := int64(avgDurations[name].Seconds())
			if avgDuration == 0 {
				sklog.Errorf("No average duration for %s!", name)
				avgDuration = 10
			}
			taskSpec.Command = []string{"/bin/bash", "dummy.sh", fmt.Sprintf("%d", avgDuration)}
			if len(taskSpec.Outputs) > 0 {
				taskSpec.Command = append(taskSpec.Command, taskSpec.Outputs...)
			} else {
				taskSpec.Command = append(taskSpec.Command, "${ISOLATED_OUTDIR}")
			}
			// Remember which dimension keys matter for grouping the bots.
			for _, dim := range taskSpec.Dimensions {
				split := strings.SplitN(dim, ":", 2)
				if len(split) != 2 {
					sklog.Fatalf("Invalid dimension: %s", dim)
				}
				includeDimensions[split[0]] = true
			}
			taskSpec.EnvPrefixes = nil
			taskSpec.ExtraTags = nil
			taskSpec.CasSpec = "infrabots"
			taskSpec.ServiceAccount = ""
			cfgB.Tasks[name] = taskSpec
		}
	}()

	// Set up Swarming client.
	swarmClient := httputils.DefaultClientConfig().WithTokenSource(ts).WithDialTimeout(3 * time.Minute).With2xxOnly().Client()
	swarm, err := swarming.NewApiClient(swarmClient, "chromium-swarm.appspot.com")
	if err != nil {
		sklog.Fatal(err)
	}
	bots, err := swarm.ListBotsForPool(ctx, "Skia")
	if err != nil {
		sklog.Fatal(err)
	}
	sklog.Infof("Found %d bots", len(bots))

	// From here on, cfgB and includeDimensions are needed.
	wg.Wait()

	// botGroups maps dimension sets to bot group IDs. The dimension set is
	// just a concatenation of bot dimensions.
	botGroups := map[string]string{}
	// botGroupIds simply lists the bot group IDs.
	botGroupIds := []string{}
	// numBots maps bot group IDs to the number of bots in each group.
	numBots := map[string]int{}
	// dimsByGroup maps a bot group ID (derived from dimensions) to a
	// 2-level map of the dimensions themselves: map[key]map[value]true.
	// This makes it easy to check whether a bot group can run a given task.
	dimsByGroup := map[string]map[string]map[string]bool{}

	// Sort the bots so that group IDs are assigned in a stable order.
	sort.Slice(bots, func(i, j int) bool { return bots[i].BotId < bots[j].BotId })
	for _, bot := range bots {
		// botDims maps dimension keys to dimension values, using a
		// sub-map so that we can check for the existence of specific
		// values.
		botDims := map[string]map[string]bool{}
		subKeys := []string{}
		for _, dim := range bot.Dimensions {
			if includeDimensions[dim.Key] {
				vals := make([]string, 0, len(dim.Value))
				valsMap := make(map[string]bool, len(dim.Value))
				for _, val := range dim.Value {
					vals = append(vals, val)
					valsMap[val] = true
				}
				sort.Strings(vals)
				subKeys = append(subKeys, fmt.Sprintf("%s:%s", dim.Key, strings.Join(vals, ",")))
				botDims[dim.Key] = valsMap
			}
		}
		sort.Strings(subKeys)
		dimSetKey := strings.Join(subKeys, ";")
		groupId, ok := botGroups[dimSetKey]
		if !ok {
			// First bot with this dimension set: allocate the next group ID.
			groupId = fmt.Sprintf("%03d", len(botGroups))
			botGroups[dimSetKey] = groupId
			botGroupIds = append(botGroupIds, groupId)
			b, err := json.MarshalIndent(botDims, "", " ")
			if err != nil {
				sklog.Fatal(err)
			}
			sklog.Infof("Group %s:\n%s", groupId, b)
		}
		numBots[groupId]++
		dimsByGroup[groupId] = botDims
	}
	sklog.Infof("Found %d sets of bots with shared dimensions.", len(botGroups))

	// Now, match the task specs up to bot groups. Walk the tasks in sorted
	// order so that the log output does not depend on map iteration order.
	sortedTasks := make([]string, 0, len(cfgB.Tasks))
	for name := range cfgB.Tasks {
		sortedTasks = append(sortedTasks, name)
	}
	sort.Strings(sortedTasks)
	used := map[string]bool{}
	for _, name := range sortedTasks {
		t := cfgB.Tasks[name]
		groups := []string{}
		for _, groupId := range botGroupIds {
			botDims := dimsByGroup[groupId]
			canHandle := true
			for _, dim := range t.Dimensions {
				split := strings.SplitN(dim, ":", 2)
				if len(split) != 2 {
					sklog.Fatalf("Invalid dimension: %s", dim)
				}
				if vals, ok := botDims[split[0]]; !ok || !vals[split[1]] {
					canHandle = false
					break
				}
			}
			if canHandle {
				groups = append(groups, groupId)
			}
		}
		// We don't know how to specify that a dimension could be one of
		// two different values, so we can't specify the "bot-group"
		// dimension to mean one of a set of groups. Just pick the first.
		if len(groups) == 0 {
			sklog.Errorf("No bots can run %s", name)
			continue
		}
		// Build a fresh slice per task rather than appending to the shared
		// package-level slice, so that tasks can never alias each other.
		taskDims := make([]string, 0, len(dimensions)+1)
		taskDims = append(taskDims, dimensions...)
		taskDims = append(taskDims, fmt.Sprintf(BOT_GROUP_TMPL, groups[0]))
		t.Dimensions = taskDims
		used[groups[0]] = true
		if len(groups) > 1 {
			sklog.Infof("Have %d groups but chose %s; %v", len(groups), groups[0], groups)
		}
	}
	if err := specs.WriteTasksCfg(cfgB, *to); err != nil {
		sklog.Fatal(err)
	}

	// Create sets of new bots with the dimension sets from above.
	botIdStart := 100 // To avoid issues with zero-padding.
	rangeStart := botIdStart
	var botCfgData strings.Builder
	for _, groupId := range botGroupIds {
		if !used[groupId] {
			sklog.Infof("Unused group: %s", groupId)
			continue
		}
		groupDim := fmt.Sprintf(" dimensions: \"%s\"\n", fmt.Sprintf(BOT_GROUP_TMPL, groupId))
		n := numBots[groupId]
		rangeStr := fmt.Sprintf("{%03d..%03d}", rangeStart, rangeStart+n-1)
		// A group with a single bot gets a plain ID rather than a
		// one-element range. This must look at the size of this group, not
		// at the total number of bots.
		if n == 1 {
			rangeStr = fmt.Sprintf("%03d", rangeStart)
		}
		botCfgData.WriteString(fmt.Sprintf(BOT_SECTION_TMPL, fmt.Sprintf(BOT_NAME_TMPL, rangeStr), groupDim))
		rangeStart += n
	}
	// 0644: this is plain config data, so it needs neither the executable
	// bits nor the world-writable bit which os.ModePerm would request.
	if err := os.WriteFile(*botsCfg, []byte(botCfgData.String()), 0644); err != nil {
		sklog.Fatal(err)
	}
	sklog.Infof("Create bots with:\n$ go run ./go/gce/swarming/swarming_vm.go --dev --create --type=linux-micro --instances=%d-%d", botIdStart, rangeStart-1)
}