flakiness/go/analysis/analysis.go - buildbot - Git at Google

 package analysis

 /*
 	Find flaky tasks within the given time range.
 */

 import (
 	"regexp"
 	"strings"
 	"time"

 	"go.skia.org/infra/go/git/repograph"
 	"go.skia.org/infra/go/sklog"
 	"go.skia.org/infra/go/util"
 	"go.skia.org/infra/task_scheduler/go/db"
 )

 var (
 	// revertRE is a regular expression used for detecting a commit which
 	// reverts another commit.
 	revertRE = regexp.MustCompile("This reverts commit ([a-fA-F0-9]{40})")
 )

 // Results is a struct which contains results from all types of flakiness
 // analysis for a given TaskSpec within a repo.
 type Results struct {
 	DefinitelyFlaky []*Flake
 	InfraFailures   []*Flake
 	MaybeFlaky      []*Flake
 }

 // Flake is a struct representing a particular instance of a flake,
 // illustrated by a sequence of tasks.
 type Flake struct {
 	Tasks []*db.Task
 }

 // Analyze loads tasks from the given time period, finds various types of
 // flakes, and returns a map[repo URL]map[task spec name]*Results.
 func Analyze(d db.TaskReader, start, end time.Time, repos repograph.Map) (map[string]map[string]*Results, error) {
 	// Obtain all tasks from the time range.
 	tasks, err := d.GetTasksFromDateRange(start, end)
 	if err != nil {
 		return nil, err
 	}

 	// Organize by repo and task spec.
 	byTaskSpec := make(map[string]map[string][]*db.Task, len(repos))
 	for _, t := range tasks {
 		// Filter by repo, since the DB might contain tasks from repos
 		// we don't care about.
 		if _, ok := repos[t.Repo]; !ok {
 			continue
 		}

 		m, ok := byTaskSpec[t.Repo]
 		if !ok {
 			m = map[string][]*db.Task{}
 			byTaskSpec[t.Repo] = m
 		}
 		m[t.Name] = append(m[t.Name], t)
 	}

 	// Find flakes for repos+taskspecs.
 	rv := make(map[string]map[string]*Results, len(repos))
 	for repoUrl, taskSpecs := range byTaskSpec {
 		m := make(map[string]*Results, len(taskSpecs))
 		for taskSpec, tasks := range taskSpecs {
 			def := DefinitelyFlaky(tasks)
 			inf := InfraFailures(tasks)
 			may := MaybeFlaky(tasks, start, end, repos[repoUrl])
 			if len(def) > 0 || len(inf) > 0 || len(may) > 0 {
 				m[taskSpec] = &Results{
 					DefinitelyFlaky: def,
 					InfraFailures:   inf,
 					MaybeFlaky:      may,
 				}
 			}
 		}
 		if len(m) > 0 {
 			rv[repoUrl] = m
 		}
 	}
 	return rv, nil
 }

 // InfraFailures finds infrastructure-related failures in the given set of tasks.
 func InfraFailures(tasks []*db.Task) []*Flake {
 	rv := []*Flake{}
 	for _, t := range tasks {
 		if t.Status == db.TASK_STATUS_MISHAP {
 			rv = append(rv, &Flake{
 				[]*db.Task{t},
 			})
 		}
 	}
 	return rv
 }

 // DefinitelyFlaky finds tasks which failed and whose retries succeeded.
 func DefinitelyFlaky(tasks []*db.Task) []*Flake {
 	// Organize the tasks by ID so that we can find them later.
 	byId := make(map[string]*db.Task, len(tasks))
 	for _, t := range tasks {
 		byId[t.Id] = t
 	}

 	rv := []*Flake{}
 	for _, t := range tasks {
 		// If an initial attempt failed but its retry succeeded,
 		// it's definitely flaky.
 		if t.RetryOf != "" && t.Success() {
 			orig, ok := byId[t.RetryOf]
 			if !ok {
 				sklog.Warningf("Failed to retrieve task %q: not in range", t.RetryOf)
 				continue
 			}
 			if orig.Status == db.TASK_STATUS_MISHAP {
 				// Infra failures are handled elsewhere.
 				continue
 			}
 			rv = append(rv, &Flake{
 				[]*db.Task{orig, t},
 			})
 		}
 	}
 	return rv
 }

 // findTaskSequences finds sequences of N consecutive (commit-wise) tasks.
 func findTaskSequences(n int, tasks []*db.Task, start, end time.Time, repo *repograph.Graph) [][]*db.Task {
 	// First, organize tasks by blamelist.
 	byBlamelist := make(map[string]*db.Task, len(tasks))
 	for _, t := range tasks {
 		for _, c := range t.Commits {
 			byBlamelist[c] = t
 		}
 	}

 	// Traverse the commit history, obtain all tasks.
 	sequences := [][]*db.Task{}
 	visited := map[*db.Task]bool{}
 	for _, b := range repo.Branches() {
 		seq := make([]*db.Task, 0, n)
 		if err := repo.Get(b).Recurse(func(c *repograph.Commit) (bool, error) {
 			if start.After(c.Timestamp) {
 				return false, nil
 			}
 			// Keep recursing in this case; we haven't gotten to the commits we care about.
 			if c.Timestamp.After(end) {
 				return true, nil
 			}

 			task, ok := byBlamelist[c.Hash]
 			if !ok {
 				return true, nil
 			}
 			// Throw out unfinished tasks and those outside the window.
 			if task.Created.After(end) || start.After(task.Created) || !task.Done() {
 				return true, nil
 			}

 			// Multiple commits in a row may share the same task.
 			var last *db.Task
 			if len(seq) > 0 {
 				last = seq[len(seq)-1]
 			}
 			if task == last {
 				return true, nil
 			}

 			// Add this task to the current sequence.
 			if len(seq) == cap(seq) {
 				copy(seq, seq[1:])
 				seq[cap(seq)-1] = task
 			} else {
 				seq = append(seq, task)
 			}
 			if len(seq) == cap(seq) {
 				// Make sure we haven't seen this sequence before.
 				// This can happen in the case of two diverging branches.
 				allVisited := true
 				for _, t := range seq {
 					if !visited[t] {
 						allVisited = false
 					}
 				}
 				if allVisited {
 					return false, nil
 				}

 				// Save the sequence.
 				cpy := make([]*db.Task, len(seq))
 				copy(cpy, seq)
 				sequences = append(sequences, cpy)
 			}
 			visited[task] = true
 			return true, nil
 		}); err != nil {
 			// Recurse only returns an error if the function we pass
 			// in returns an error. It doesn't.
 			sklog.Errorf("Error: %s", err)
 		}
 	}
 	return sequences
 }

 // revertOf returns the commit hash of the commit which the given commit
 // reverts, or the empty string if the given commit is not a revert.
 func revertOf(c *repograph.Commit) string {
 	for _, line := range strings.Split(c.Body, "\n") {
 		m := revertRE.FindStringSubmatch(line)
 		if m == nil {
 			continue
 		}
 		if len(m) != 2 {
 			continue
 		}
 		return m[1]
 	}
 	return ""
 }

 // MaybeFlaky finds sequences of tasks which display ping-ponging between
 // success and failure states.
 func MaybeFlaky(tasks []*db.Task, start, end time.Time, repo *repograph.Graph) []*Flake {
 	rv := []*Flake{}

 	// TODO(borenet): Should we also look for sequences of four states, eg.
 	// FSMF or SFMS?
 	sequences := findTaskSequences(3, tasks, start, end, repo)

 	// Find sequences which display the alternating-status behavior.
 	for _, seq := range sequences {
 		// Infra failures are handled elsewhere.
 		mishap := false
 		for _, t := range seq {
 			if t.Status == db.TASK_STATUS_MISHAP {
 				mishap = true
 				break
 			}
 		}
 		if mishap {
 			continue
 		}

 		// FSF or SFS.
 		if seq[0].Status != seq[2].Status {
 			continue
 		}
 		if seq[0].Status == seq[1].Status {
 			continue
 		}

 		// Filter out any of these where the final state change is on a
 		// revert. Sequences are in reverse chronological order, since
 		// that's how we traversed the repo.
 		revertedCommits := util.StringSet{}
 		for _, commit := range seq[0].Commits {
 			c := repo.Get(commit)
 			if c == nil {
 				sklog.Errorf("Unable to find commit %q in repo %q", seq[0].Revision, seq[0].Repo)
 				continue
 			}
 			revertedCommit := revertOf(c)
 			if revertedCommit != "" {
 				revertedCommits[revertedCommit] = true
 			}
 		}
 		wasRevert := len(revertedCommits.Intersect(util.NewStringSet(seq[1].Commits))) > 0
 		if wasRevert {
 			sklog.Infof("Failure was reverted: %s @ %s", seq[1].Name, seq[1].Revision)
 			continue
 		}

 		// Add the flake to the results.
 		flake := make([]*db.Task, len(seq))
 		copy(flake, seq)
 		rv = append(rv, &Flake{
 			Tasks: flake,
 		})
 	}
 	return rv
 }
	package analysis

	/*
	Find flaky tasks within the given time range.
	*/

	import (
	"regexp"
	"strings"
	"time"

	"go.skia.org/infra/go/git/repograph"
	"go.skia.org/infra/go/sklog"
	"go.skia.org/infra/go/util"
	"go.skia.org/infra/task_scheduler/go/db"
	)

	var (
	// revertRE is a regular expression used for detecting a commit which
	// reverts another commit.
	revertRE = regexp.MustCompile("This reverts commit ([a-fA-F0-9]{40})")
	)

	// Results is a struct which contains results from all types of flakiness
	// analysis for a given TaskSpec within a repo.
	type Results struct {
	DefinitelyFlaky []*Flake
	InfraFailures []*Flake
	MaybeFlaky []*Flake
	}

	// Flake is a struct representing a particular instance of a flake,
	// illustrated by a sequence of tasks.
	type Flake struct {
	Tasks []*db.Task
	}

	// Analyze loads tasks from the given time period, finds various types of
	// flakes, and returns a map[repo URL]map[task spec name]*Results.
	func Analyze(d db.TaskReader, start, end time.Time, repos repograph.Map) (map[string]map[string]*Results, error) {
	// Obtain all tasks from the time range.
	tasks, err := d.GetTasksFromDateRange(start, end)
	if err != nil {
	return nil, err
	}

	// Organize by repo and task spec.
	byTaskSpec := make(map[string]map[string][]*db.Task, len(repos))
	for _, t := range tasks {
	// Filter by repo, since the DB might contain tasks from repos
	// we don't care about.
	if _, ok := repos[t.Repo]; !ok {
	continue
	}

	m, ok := byTaskSpec[t.Repo]
	if !ok {
	m = map[string][]*db.Task{}
	byTaskSpec[t.Repo] = m
	}
	m[t.Name] = append(m[t.Name], t)
	}

	// Find flakes for repos+taskspecs.
	rv := make(map[string]map[string]*Results, len(repos))
	for repoUrl, taskSpecs := range byTaskSpec {
	m := make(map[string]*Results, len(taskSpecs))
	for taskSpec, tasks := range taskSpecs {
	def := DefinitelyFlaky(tasks)
	inf := InfraFailures(tasks)
	may := MaybeFlaky(tasks, start, end, repos[repoUrl])
	if len(def) > 0 \|\| len(inf) > 0 \|\| len(may) > 0 {
	m[taskSpec] = &Results{
	DefinitelyFlaky: def,
	InfraFailures: inf,
	MaybeFlaky: may,
	}
	}
	}
	if len(m) > 0 {
	rv[repoUrl] = m
	}
	}
	return rv, nil
	}

	// InfraFailures finds infrastructure-related failures in the given set of tasks.
	func InfraFailures(tasks []db.Task) []Flake {
	rv := []*Flake{}
	for _, t := range tasks {
	if t.Status == db.TASK_STATUS_MISHAP {
	rv = append(rv, &Flake{
	[]*db.Task{t},
	})
	}
	}
	return rv
	}

	// DefinitelyFlaky finds tasks which failed and whose retries succeeded.
	func DefinitelyFlaky(tasks []db.Task) []Flake {
	// Organize the tasks by ID so that we can find them later.
	byId := make(map[string]*db.Task, len(tasks))
	for _, t := range tasks {
	byId[t.Id] = t
	}

	rv := []*Flake{}
	for _, t := range tasks {
	// If an initial attempt failed but its retry succeeded,
	// it's definitely flaky.
	if t.RetryOf != "" && t.Success() {
	orig, ok := byId[t.RetryOf]
	if !ok {
	sklog.Warningf("Failed to retrieve task %q: not in range", t.RetryOf)
	continue
	}
	if orig.Status == db.TASK_STATUS_MISHAP {
	// Infra failures are handled elsewhere.
	continue
	}
	rv = append(rv, &Flake{
	[]*db.Task{orig, t},
	})
	}
	}
	return rv
	}

	// findTaskSequences finds sequences of N consecutive (commit-wise) tasks.
	func findTaskSequences(n int, tasks []db.Task, start, end time.Time, repo repograph.Graph) [][]*db.Task {
	// First, organize tasks by blamelist.
	byBlamelist := make(map[string]*db.Task, len(tasks))
	for _, t := range tasks {
	for _, c := range t.Commits {
	byBlamelist[c] = t
	}
	}

	// Traverse the commit history, obtain all tasks.
	sequences := [][]*db.Task{}
	visited := map[*db.Task]bool{}
	for _, b := range repo.Branches() {
	seq := make([]*db.Task, 0, n)
	if err := repo.Get(b).Recurse(func(c *repograph.Commit) (bool, error) {
	if start.After(c.Timestamp) {
	return false, nil
	}
	// Keep recursing in this case; we haven't gotten to the commits we care about.
	if c.Timestamp.After(end) {
	return true, nil
	}

	task, ok := byBlamelist[c.Hash]
	if !ok {
	return true, nil
	}
	// Throw out unfinished tasks and those outside the window.
	if task.Created.After(end) \|\| start.After(task.Created) \|\| !task.Done() {
	return true, nil
	}

	// Multiple commits in a row may share the same task.
	var last *db.Task
	if len(seq) > 0 {
	last = seq[len(seq)-1]
	}
	if task == last {
	return true, nil
	}

	// Add this task to the current sequence.
	if len(seq) == cap(seq) {
	copy(seq, seq[1:])
	seq[cap(seq)-1] = task
	} else {
	seq = append(seq, task)
	}
	if len(seq) == cap(seq) {
	// Make sure we haven't seen this sequence before.
	// This can happen in the case of two diverging branches.
	allVisited := true
	for _, t := range seq {
	if !visited[t] {
	allVisited = false
	}
	}
	if allVisited {
	return false, nil
	}

	// Save the sequence.
	cpy := make([]*db.Task, len(seq))
	copy(cpy, seq)
	sequences = append(sequences, cpy)
	}
	visited[task] = true
	return true, nil
	}); err != nil {
	// Recurse only returns an error if the function we pass
	// in returns an error. It doesn't.
	sklog.Errorf("Error: %s", err)
	}
	}
	return sequences
	}

	// revertOf returns the commit hash of the commit which the given commit
	// reverts, or the empty string if the given commit is not a revert.
	func revertOf(c *repograph.Commit) string {
	for _, line := range strings.Split(c.Body, "\n") {
	m := revertRE.FindStringSubmatch(line)
	if m == nil {
	continue
	}
	if len(m) != 2 {
	continue
	}
	return m[1]
	}
	return ""
	}

	// MaybeFlaky finds sequences of tasks which display ping-ponging between
	// success and failure states.
	func MaybeFlaky(tasks []db.Task, start, end time.Time, repo repograph.Graph) []*Flake {
	rv := []*Flake{}

	// TODO(borenet): Should we also look for sequences of four states, eg.
	// FSMF or SFMS?
	sequences := findTaskSequences(3, tasks, start, end, repo)

	// Find sequences which display the alternating-status behavior.
	for _, seq := range sequences {
	// Infra failures are handled elsewhere.
	mishap := false
	for _, t := range seq {
	if t.Status == db.TASK_STATUS_MISHAP {
	mishap = true
	break
	}
	}
	if mishap {
	continue
	}

	// FSF or SFS.
	if seq[0].Status != seq[2].Status {
	continue
	}
	if seq[0].Status == seq[1].Status {
	continue
	}

	// Filter out any of these where the final state change is on a
	// revert. Sequences are in reverse chronological order, since
	// that's how we traversed the repo.
	revertedCommits := util.StringSet{}
	for _, commit := range seq[0].Commits {
	c := repo.Get(commit)
	if c == nil {
	sklog.Errorf("Unable to find commit %q in repo %q", seq[0].Revision, seq[0].Repo)
	continue
	}
	revertedCommit := revertOf(c)
	if revertedCommit != "" {
	revertedCommits[revertedCommit] = true
	}
	}
	wasRevert := len(revertedCommits.Intersect(util.NewStringSet(seq[1].Commits))) > 0
	if wasRevert {
	sklog.Infof("Failure was reverted: %s @ %s", seq[1].Name, seq[1].Revision)
	continue
	}

	// Add the flake to the results.
	flake := make([]*db.Task, len(seq))
	copy(flake, seq)
	rv = append(rv, &Flake{
	Tasks: flake,
	})
	}
	return rv
	}