blob: 2705bb92f984aba6ee52d9d00918d15a8ee114cd [file] [log] [blame]
// Package cq provides tools for interacting with the Commit Queue (CQ) and
// reporting metrics about its trybots.
package cq
import (
"context"
"fmt"
"sync"
"time"
"github.com/golang/protobuf/ptypes"
buildbucketpb "go.chromium.org/luci/buildbucket/proto"
"go.skia.org/infra/go/common"
"go.skia.org/infra/go/gerrit"
"go.skia.org/infra/go/git"
"go.skia.org/infra/go/git/git_common"
"go.skia.org/infra/go/gitiles"
"go.skia.org/infra/go/metrics2"
"go.skia.org/infra/go/skerr"
"go.skia.org/infra/go/sklog"
"go.skia.org/infra/go/util"
"go.skia.org/infra/task_scheduler/go/specs"
)
const (
// MAIN_REF is the full ref name of the main branch, built from
// git_common.RefsHeadsPrefix and git.MainBranch. It is the ref at which the
// CQ configuration is read by the Skia helper functions in this file.
MAIN_REF = git_common.RefsHeadsPrefix + git.MainBranch
// Constants for in-flight metrics. INFLIGHT_METRIC_NAME is the middle
// component of the metric name; the remaining constants are suffixes that
// select the individual measurement.
INFLIGHT_METRIC_NAME = "in_flight"
INFLIGHT_TRYBOT_DURATION = "trybot_duration"
INFLIGHT_TRYBOT_NUM = "trybot_num"
INFLIGHT_WAITING_IN_CQ = "waiting_in_cq"
// Constants for landed metrics. LANDED_METRIC_NAME is the middle component
// of the metric name; the remaining constants are suffixes that select the
// individual measurement.
LANDED_METRIC_NAME = "after_commit"
LANDED_TRYBOT_DURATION = "trybot_duration"
LANDED_TOTAL_DURATION = "total_duration"
)
var (
// PRESUBMIT_BOTS is the slice of all known presubmit bot names. Jobs
// matching these names are excluded from the list of CQ trybots.
PRESUBMIT_BOTS = []string{"skia_presubmit-Trybot"}
// cqTryBotsMutex controls access to the set of CQ trybots stored on a
// Client. It is declared at package level, so it is shared by every Client
// in the process.
cqTryBotsMutex sync.RWMutex
)
// NewClient creates a new client for interacting with CQ tools.
//
// cqTryBotsFunc is called once to obtain the initial set of CQ trybots and is
// also stored on the Client so the set can be refreshed later. If that call
// fails, its error is returned and no Client is created. metricName is used as
// the prefix of every metric the Client reports.
func NewClient(gerritClient *gerrit.Gerrit, cqTryBotsFunc GetCQTryBotsFn, metricName string) (*Client, error) {
	initialTryBots, err := cqTryBotsFunc()
	if err != nil {
		return nil, err
	}
	client := &Client{
		gerritClient:  gerritClient,
		cqTryBots:     util.NewStringSet(initialTryBots),
		cqTryBotsFunc: cqTryBotsFunc,
		metricName:    metricName,
	}
	return client, nil
}
// GetCQTryBotsFn is a function type for returning the names of the CQ trybots
// of a project, or an error if they could not be determined.
type GetCQTryBotsFn func() ([]string, error)
// Client reports CQ trybot metrics for Gerrit changes. Create one with
// NewClient.
type Client struct {
// gerritClient is used to look up change properties and trybot results.
gerritClient *gerrit.Gerrit
// cqTryBots is the current set of CQ trybot names. Access is guarded by
// cqTryBotsMutex.
cqTryBots util.StringSet
// cqTryBotsFunc produces the list of CQ trybots; it is called again by
// RefreshCQTryBots.
cqTryBotsFunc GetCQTryBotsFn
// metricName is the prefix of every metric name this Client reports.
metricName string
}
// GetSkiaCQTryBots is a Skia implementation of GetCQTryBotsFn.
func GetSkiaCQTryBots() ([]string, error) {
cfg, err := GetCQJobsToConfig(gitiles.NewRepo(common.REPO_SKIA, nil), MAIN_REF)
if err != nil {
return nil, err
}
return GetCQTryBots(cfg)
}
// GetSkiaInfraCQTryBots is a Skia Infra implementation of GetCQTryBotsFn.
func GetSkiaInfraCQTryBots() ([]string, error) {
cfg, err := GetCQJobsToConfig(gitiles.NewRepo(common.REPO_SKIA_INFRA, nil), MAIN_REF)
if err != nil {
return nil, err
}
return GetCQTryBots(cfg)
}
// GetCQJobsToConfig returns the commit queue section of the tasks config for
// the given repo, keyed by job name. The config file (specs.TASKS_CFG_FILE) is
// read at the given ref and then parsed; a read error is returned as-is and a
// parse error is returned wrapped with additional context.
func GetCQJobsToConfig(repo *gitiles.Repo, ref string) (map[string]*specs.CommitQueueJobConfig, error) {
	raw, readErr := repo.ReadFileAtRef(context.Background(), specs.TASKS_CFG_FILE, ref)
	if readErr != nil {
		return nil, readErr
	}

	parsed, parseErr := specs.ParseTasksCfg(string(raw))
	if parseErr != nil {
		return nil, skerr.Wrapf(parseErr, "Error when parsing tasks.json cfg")
	}
	return parsed.CommitQueue, nil
}
// GetCQTryBots is a convenience method for retrieving the list of CQ trybots
// from a Config. Every key of cqJobsToConfig is returned except those matched
// against PRESUBMIT_BOTS. The order of the result follows map iteration and is
// therefore unspecified. The returned error is always nil.
func GetCQTryBots(cqJobsToConfig map[string]*specs.CommitQueueJobConfig) ([]string, error) {
	names := make([]string, 0, len(cqJobsToConfig))
	for name := range cqJobsToConfig {
		// Presubmit bots are left out because they could fail or be delayed
		// due to factors such as owners approval and other project specific
		// checks.
		if !util.ContainsAny(name, PRESUBMIT_BOTS) {
			names = append(names, name)
		}
	}
	sklog.Infof("The list of CQ trybots is: %s", names)
	return names, nil
}
// RefreshCQTryBots refreshes the set of CQ trybots on the instance by calling
// the Client's GetCQTryBotsFn again. If that call fails the error is returned
// and the existing set is left untouched. The write to the set is protected by
// cqTryBotsMutex.
func (c *Client) RefreshCQTryBots() error {
	latest, err := c.cqTryBotsFunc()
	if err != nil {
		return err
	}
	// Build the new set before taking the lock so the critical section is just
	// the assignment.
	refreshed := util.NewStringSet(latest)

	cqTryBotsMutex.Lock()
	c.cqTryBots = refreshed
	cqTryBotsMutex.Unlock()
	return nil
}
// ReportCQStats reports all relevant stats for the specified Gerrit change.
// Note: Different stats are reported depending on whether the change has been
// merged or not.
// All created metrics will be registered in reportedMetrics.
//
// An error is returned if the change properties or the trybot results cannot
// be fetched, or if the change has no patchsets. If none of the trybot results
// belong to a CQ trybot, nothing is reported and nil is returned.
func (c *Client) ReportCQStats(ctx context.Context, change int64, reportedMetrics map[metrics2.Int64Metric]struct{}) error {
	changeInfo, err := c.gerritClient.GetIssueProperties(ctx, change)
	if err != nil {
		return err
	}
	patchsetIds := changeInfo.GetPatchsetIDs()
	if len(patchsetIds) == 0 {
		// Without this guard the indexing below would panic.
		return fmt.Errorf("change %d has no patchsets", change)
	}
	latestPatchsetId := patchsetIds[len(patchsetIds)-1]
	if changeInfo.Committed && len(patchsetIds) > 1 {
		// TODO(rmistry): The last patchset in Gerrit does not contain trybot
		// information so we have to look at the one immediately before it.
		// This will be fixed with crbug.com/634944.
		// When the change has only a single patchset there is no earlier one
		// to look at, so the only patchset is used as-is.
		latestPatchsetId = patchsetIds[len(patchsetIds)-2]
	}
	builds, err := c.gerritClient.GetTrybotResults(ctx, change, latestPatchsetId)
	if err != nil {
		return err
	}
	// Consider only CQ bots.
	cqBuilds := []*buildbucketpb.Build{}
	for _, b := range builds {
		if c.isCQTryBot(b.Builder.Builder) {
			cqBuilds = append(cqBuilds, b)
		}
	}
	gerritURL := fmt.Sprintf("%s/c/%d/%d", gerrit.GerritSkiaURL, change, latestPatchsetId)
	if len(cqBuilds) == 0 {
		sklog.Infof("No trybot results were found for %s", gerritURL)
		return nil
	}
	sklog.Infof("Starting processing %s. Merged status: %t", gerritURL, changeInfo.Committed)
	if changeInfo.Committed {
		c.ReportCQStatsForLandedCL(cqBuilds, gerritURL, reportedMetrics)
	} else {
		c.ReportCQStatsForInFlightCL(cqBuilds, gerritURL, reportedMetrics)
	}
	return nil
}
// ReportCQStatsForLandedCL reports the following metrics for the specified
// change and patchsetID:
// * The time each CQ trybot took to complete (creation to end time, in
//   seconds).
// * The total duration, which is set to the longest of those individual
//   trybot durations.
// Builds whose timestamps cannot be converted, or which have no end time yet,
// are logged and skipped.
// All created metrics will be registered in reportedMetrics.
func (c *Client) ReportCQStatsForLandedCL(cqBuilds []*buildbucketpb.Build, gerritURL string, reportedMetrics map[metrics2.Int64Metric]struct{}) {
	var lastCompletion time.Time
	var longestDuration int64

	for _, build := range cqBuilds {
		created, err := ptypes.Timestamp(build.CreateTime)
		if err != nil {
			sklog.Errorf("Failed to convert timestamp for %d; skipping: %s", build.Id, err)
			continue
		}
		created = created.UTC()

		if build.EndTime == nil {
			sklog.Warningf("Skipping %s on %s. The correct completed time has not shown up in Buildbucket yet.", build.Builder.Builder, gerritURL)
			continue
		}
		completed, err := ptypes.Timestamp(build.EndTime)
		if err != nil {
			sklog.Errorf("Failed to convert timestamp for %d; skipping: %s", build.Id, err)
			continue
		}
		completed = completed.UTC()

		// Track the latest completion time seen across all builds.
		if completed.After(lastCompletion) {
			lastCompletion = completed
		}

		builderName := build.Builder.Builder
		elapsed := int64(completed.Sub(created).Seconds())
		sklog.Infof("%s was created at %s by %s and completed at %s. Total duration: %d", builderName, created, gerritURL, completed, elapsed)

		perBotMetric := c.getLandedTrybotDurationMetric(builderName, gerritURL)
		perBotMetric.Update(elapsed)
		reportedMetrics[perBotMetric] = struct{}{}

		if elapsed > longestDuration {
			longestDuration = elapsed
		}
	}

	sklog.Infof("Maximum trybot duration for %s: %d", gerritURL, longestDuration)
	sklog.Infof("Furthest completion time for %s: %s", gerritURL, lastCompletion)

	totalMetricName := fmt.Sprintf("%s_%s_%s", c.metricName, LANDED_METRIC_NAME, LANDED_TOTAL_DURATION)
	totalMetric := metrics2.GetInt64Metric(totalMetricName, map[string]string{"gerritURL": gerritURL})
	totalMetric.Update(longestDuration)
	reportedMetrics[totalMetric] = struct{}{}
}
// ReportCQStatsForInFlightCL reports the following metrics for the specified
// change and patchsetID:
// * How long CQ trybots have been running for (seconds since creation, for
//   builds that have no end time yet).
// * How many CQ trybots have been triggered. Completed builds that were
//   created more than a day ago are not counted.
// Builds whose creation timestamp cannot be converted are logged and get no
// duration metric, but are still counted as triggered.
// All created metrics will be registered in reportedMetrics.
func (c *Client) ReportCQStatsForInFlightCL(cqBuilds []*buildbucketpb.Build, gerritURL string, reportedMetrics map[metrics2.Int64Metric]struct{}) {
	totalTriggeredCQBots := 0
	// A single reference instant is used for both the age check and the
	// running durations so every build is measured against the same time.
	currentTime := time.Now().UTC()
	for _, b := range cqBuilds {
		totalTriggeredCQBots++
		createdTime, err := ptypes.Timestamp(b.CreateTime)
		if err != nil {
			sklog.Errorf("Failed to convert timestamp for %d; skipping: %s", b.Id, err)
			continue
		}
		createdTime = createdTime.UTC()
		if b.EndTime != nil {
			if currentTime.Sub(createdTime) > 24*time.Hour {
				// The build was created more than a day ago. Do not include it
				// in totalTriggeredCQBots. See skbug.com/7340.
				// Creation time is used above instead of completion time because
				// that is what CQ does:
				// https://chrome-internal.googlesource.com/infra/infra_internal/+show/master/infra_internal/services/cq/verification/tryjob_utils.py#1271
				totalTriggeredCQBots--
			}
			// The build has completed so move on.
			continue
		}
		duration := int64(currentTime.Sub(createdTime).Seconds())
		inflightTrybotDurationMetric := c.getInflightTrybotDurationMetric(b.Builder.Builder, gerritURL)
		inflightTrybotDurationMetric.Update(duration)
		reportedMetrics[inflightTrybotDurationMetric] = struct{}{}
	}
	trybotNumMetric := metrics2.GetInt64Metric(fmt.Sprintf("%s_%s_%s", c.metricName, INFLIGHT_METRIC_NAME, INFLIGHT_TRYBOT_NUM), map[string]string{"gerritURL": gerritURL})
	trybotNumMetric.Update(int64(totalTriggeredCQBots))
	reportedMetrics[trybotNumMetric] = struct{}{}
}
// getInflightTrybotDurationMetric returns the in-flight duration metric for the
// given trybot on the given Gerrit URL. The metric name is built from the
// Client's metric prefix, INFLIGHT_METRIC_NAME and INFLIGHT_TRYBOT_DURATION;
// the trybot and URL are attached as tags.
func (c *Client) getInflightTrybotDurationMetric(tryBot, gerritURL string) metrics2.Int64Metric {
	return metrics2.GetInt64Metric(
		fmt.Sprintf("%s_%s_%s", c.metricName, INFLIGHT_METRIC_NAME, INFLIGHT_TRYBOT_DURATION),
		map[string]string{
			"trybot":    tryBot,
			"gerritURL": gerritURL,
		},
	)
}
// getLandedTrybotDurationMetric returns the landed duration metric for the
// given trybot on the given Gerrit URL. The metric name is built from the
// Client's metric prefix, LANDED_METRIC_NAME and LANDED_TRYBOT_DURATION; the
// trybot and URL are attached as tags.
func (c *Client) getLandedTrybotDurationMetric(tryBot, gerritURL string) metrics2.Int64Metric {
	return metrics2.GetInt64Metric(
		fmt.Sprintf("%s_%s_%s", c.metricName, LANDED_METRIC_NAME, LANDED_TRYBOT_DURATION),
		map[string]string{
			"trybot":    tryBot,
			"gerritURL": gerritURL,
		},
	)
}
// isCQTryBot reports whether builderName is in the Client's current set of CQ
// trybots. The lookup is done while holding cqTryBotsMutex for reading.
func (c *Client) isCQTryBot(builderName string) bool {
	cqTryBotsMutex.RLock()
	defer cqTryBotsMutex.RUnlock()
	return c.cqTryBots[builderName]
}