blob: 415910efbcbe83267ed26c49e39194527ae4ef95 [file] [log] [blame]
package analysis
/*
Find flaky tasks within the given time range.
*/
import (
"regexp"
"strings"
"time"
"go.skia.org/infra/go/git/repograph"
"go.skia.org/infra/go/sklog"
"go.skia.org/infra/go/util"
"go.skia.org/infra/task_scheduler/go/db"
)
var (
// revertRE is a regular expression used for detecting a commit which
// reverts another commit.
revertRE = regexp.MustCompile("This reverts commit ([a-fA-F0-9]{40})")
)
// Results is a struct which contains results from all types of flakiness
// analysis for a given TaskSpec within a repo.
type Results struct {
DefinitelyFlaky []*Flake
InfraFailures []*Flake
MaybeFlaky []*Flake
}
// Flake is a struct representing a particular instance of a flake,
// illustrated by a sequence of tasks.
type Flake struct {
Tasks []*db.Task
}
// Analyze loads tasks from the given time period, finds various types of
// flakes, and returns a map[repo URL]map[task spec name]*Results.
func Analyze(d db.TaskReader, start, end time.Time, repos repograph.Map) (map[string]map[string]*Results, error) {
// Obtain all tasks from the time range.
tasks, err := d.GetTasksFromDateRange(start, end)
if err != nil {
return nil, err
}
// Organize by repo and task spec.
byTaskSpec := make(map[string]map[string][]*db.Task, len(repos))
for _, t := range tasks {
// Filter by repo, since the DB might contain tasks from repos
// we don't care about.
if _, ok := repos[t.Repo]; !ok {
continue
}
m, ok := byTaskSpec[t.Repo]
if !ok {
m = map[string][]*db.Task{}
byTaskSpec[t.Repo] = m
}
m[t.Name] = append(m[t.Name], t)
}
// Find flakes for repos+taskspecs.
rv := make(map[string]map[string]*Results, len(repos))
for repoUrl, taskSpecs := range byTaskSpec {
m := make(map[string]*Results, len(taskSpecs))
for taskSpec, tasks := range taskSpecs {
def := DefinitelyFlaky(tasks)
inf := InfraFailures(tasks)
may := MaybeFlaky(tasks, start, end, repos[repoUrl])
if len(def) > 0 || len(inf) > 0 || len(may) > 0 {
m[taskSpec] = &Results{
DefinitelyFlaky: def,
InfraFailures: inf,
MaybeFlaky: may,
}
}
}
if len(m) > 0 {
rv[repoUrl] = m
}
}
return rv, nil
}
// InfraFailures finds infrastructure-related failures in the given set of tasks.
func InfraFailures(tasks []*db.Task) []*Flake {
rv := []*Flake{}
for _, t := range tasks {
if t.Status == db.TASK_STATUS_MISHAP {
rv = append(rv, &Flake{
[]*db.Task{t},
})
}
}
return rv
}
// DefinitelyFlaky finds tasks which failed and whose retries succeeded.
func DefinitelyFlaky(tasks []*db.Task) []*Flake {
// Organize the tasks by ID so that we can find them later.
byId := make(map[string]*db.Task, len(tasks))
for _, t := range tasks {
byId[t.Id] = t
}
rv := []*Flake{}
for _, t := range tasks {
// If an initial attempt failed but its retry succeeded,
// it's definitely flaky.
if t.RetryOf != "" && t.Success() {
orig, ok := byId[t.RetryOf]
if !ok {
sklog.Warningf("Failed to retrieve task %q: not in range", t.RetryOf)
continue
}
if orig.Status == db.TASK_STATUS_MISHAP {
// Infra failures are handled elsewhere.
continue
}
rv = append(rv, &Flake{
[]*db.Task{orig, t},
})
}
}
return rv
}
// findTaskSequences finds sequences of N consecutive (commit-wise) tasks.
func findTaskSequences(n int, tasks []*db.Task, start, end time.Time, repo *repograph.Graph) [][]*db.Task {
// First, organize tasks by blamelist.
byBlamelist := make(map[string]*db.Task, len(tasks))
for _, t := range tasks {
for _, c := range t.Commits {
byBlamelist[c] = t
}
}
// Traverse the commit history, obtain all tasks.
sequences := [][]*db.Task{}
visited := map[*db.Task]bool{}
for _, b := range repo.Branches() {
seq := make([]*db.Task, 0, n)
if err := repo.Get(b).Recurse(func(c *repograph.Commit) (bool, error) {
if start.After(c.Timestamp) {
return false, nil
}
// Keep recursing in this case; we haven't gotten to the commits we care about.
if c.Timestamp.After(end) {
return true, nil
}
task, ok := byBlamelist[c.Hash]
if !ok {
return true, nil
}
// Throw out unfinished tasks and those outside the window.
if task.Created.After(end) || start.After(task.Created) || !task.Done() {
return true, nil
}
// Multiple commits in a row may share the same task.
var last *db.Task
if len(seq) > 0 {
last = seq[len(seq)-1]
}
if task == last {
return true, nil
}
// Add this task to the current sequence.
if len(seq) == cap(seq) {
copy(seq, seq[1:])
seq[cap(seq)-1] = task
} else {
seq = append(seq, task)
}
if len(seq) == cap(seq) {
// Make sure we haven't seen this sequence before.
// This can happen in the case of two diverging branches.
allVisited := true
for _, t := range seq {
if !visited[t] {
allVisited = false
}
}
if allVisited {
return false, nil
}
// Save the sequence.
cpy := make([]*db.Task, len(seq))
copy(cpy, seq)
sequences = append(sequences, cpy)
}
visited[task] = true
return true, nil
}); err != nil {
// Recurse only returns an error if the function we pass
// in returns an error. It doesn't.
sklog.Errorf("Error: %s", err)
}
}
return sequences
}
// revertOf returns the commit hash of the commit which the given commit
// reverts, or the empty string if the given commit is not a revert.
func revertOf(c *repograph.Commit) string {
for _, line := range strings.Split(c.Body, "\n") {
m := revertRE.FindStringSubmatch(line)
if m == nil {
continue
}
if len(m) != 2 {
continue
}
return m[1]
}
return ""
}
// MaybeFlaky finds sequences of tasks which display ping-ponging between
// success and failure states.
func MaybeFlaky(tasks []*db.Task, start, end time.Time, repo *repograph.Graph) []*Flake {
rv := []*Flake{}
// TODO(borenet): Should we also look for sequences of four states, eg.
// FSMF or SFMS?
sequences := findTaskSequences(3, tasks, start, end, repo)
// Find sequences which display the alternating-status behavior.
for _, seq := range sequences {
// Infra failures are handled elsewhere.
mishap := false
for _, t := range seq {
if t.Status == db.TASK_STATUS_MISHAP {
mishap = true
break
}
}
if mishap {
continue
}
// FSF or SFS.
if seq[0].Status != seq[2].Status {
continue
}
if seq[0].Status == seq[1].Status {
continue
}
// Filter out any of these where the final state change is on a
// revert. Sequences are in reverse chronological order, since
// that's how we traversed the repo.
revertedCommits := util.StringSet{}
for _, commit := range seq[0].Commits {
c := repo.Get(commit)
if c == nil {
sklog.Errorf("Unable to find commit %q in repo %q", seq[0].Revision, seq[0].Repo)
continue
}
revertedCommit := revertOf(c)
if revertedCommit != "" {
revertedCommits[revertedCommit] = true
}
}
wasRevert := len(revertedCommits.Intersect(util.NewStringSet(seq[1].Commits))) > 0
if wasRevert {
sklog.Infof("Failure was reverted: %s @ %s", seq[1].Name, seq[1].Revision)
continue
}
// Add the flake to the results.
flake := make([]*db.Task, len(seq))
copy(flake, seq)
rv = append(rv, &Flake{
Tasks: flake,
})
}
return rv
}