[autoroll] Limit to 3 retries of the same roll Additionally, delay sending the notification that a roll failed and updating metrics until the roll is closed (either landed or abandoned after 3 attempts). Bug: skia:10696 Change-Id: I53d70c02177c37a1717019f52f8bdcff0eee8459 Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/436056 Commit-Queue: Eric Boren <borenet@google.com> Reviewed-by: Ravi Mistry <rmistry@google.com>

commit: 5758def28ecb6cededae8fb444ce8d5fe8043430 [log] [tgz]
author: Eric Boren <borenet@google.com> Tue Aug 03 12:18:46 2021 -0400
committer: SkCQ <skcq-be@skia-corp.google.com.iam.gserviceaccount.com> Tue Aug 03 17:08:43 2021 +0000
tree: 93c5e59ab16758090525e71c550e15d53aa12c06
parent: 751ab60fb293be2f328d9408524f08fdd10b1a67 [diff]
diff --git a/autoroll/go/codereview/roll.go b/autoroll/go/codereview/roll.go
index bd386bd..f561a59 100644
--- a/autoroll/go/codereview/roll.go
+++ b/autoroll/go/codereview/roll.go

@@ -273,6 +273,7 @@
 			return skerr.Wrap(err)
 		}
 		r.issue.IsDryRun = false
+		r.issue.Attempt++
 		return nil
 	})
 }
@@ -287,13 +288,14 @@
 			return skerr.Wrap(err)
 		}
 		r.issue.IsDryRun = true
+		r.issue.Attempt++
 		return nil
 	})
 }
 
 // See documentation for state_machine.RollCLImpl interface.
 func (r *gerritRoll) Update(ctx context.Context) error {
-	alreadyFinished := r.IsFinished()
+	alreadyClosed := r.IsClosed()
 	ci, err := r.retrieveRoll(ctx)
 	if err != nil {
 		return err
@@ -305,12 +307,17 @@
 	if err := r.recent.Update(ctx, r.issue); err != nil {
 		return err
 	}
-	if r.IsFinished() && !alreadyFinished && r.finishedCallback != nil {
+	if r.IsClosed() && !alreadyClosed && r.finishedCallback != nil {
 		return r.finishedCallback(ctx, r)
 	}
 	return nil
 }
 
+// See documentation for state_machine.RollClImpl interface.
+func (r *gerritRoll) Attempt() int {
+	return r.issue.Attempt
+}
+
 // See documentation for state_machine.RollCLImpl interface.
 func (r *gerritRoll) IssueID() string {
 	return fmt.Sprintf("%d", r.issue.Issue)
@@ -488,7 +495,7 @@
 
 // See documentation for state_machine.RollCLImpl interface.
 func (r *githubRoll) Update(ctx context.Context) error {
-	alreadyFinished := r.IsFinished()
+	alreadyClosed := r.IsClosed()
 	pullRequest, err := r.retrieveRoll(ctx)
 	if err != nil {
 		return err
@@ -500,7 +507,7 @@
 	if err := r.recent.Update(ctx, r.issue); err != nil {
 		return err
 	}
-	if r.IsFinished() && !alreadyFinished && r.finishedCallback != nil {
+	if r.IsClosed() && !alreadyClosed && r.finishedCallback != nil {
 		return r.finishedCallback(ctx, r)
 	}
 	return nil
@@ -573,6 +580,7 @@
 			return err
 		}
 		r.issue.IsDryRun = false
+		r.issue.Attempt++
 		return nil
 	})
 }
@@ -584,10 +592,16 @@
 			return err
 		}
 		r.issue.IsDryRun = true
+		r.issue.Attempt++
 		return nil
 	})
 }
 
+// See documentation for state_machine.RollClImpl interface.
+func (r *githubRoll) Attempt() int {
+	return r.issue.Attempt
+}
+
 // See documentation for state_machine.RollCLImpl interface.
 func (r *githubRoll) IssueID() string {
 	return fmt.Sprintf("%d", r.issue.Issue)

diff --git a/autoroll/go/state_machine/state_machine.go b/autoroll/go/state_machine/state_machine.go
index dfd9086..46f2cbf 100644
--- a/autoroll/go/state_machine/state_machine.go
+++ b/autoroll/go/state_machine/state_machine.go

@@ -65,6 +65,9 @@
 	// arbitrary limit just to keep us from performing an unbounded number
 	// of transitions at a time.
 	MAX_NOOP_TRANSITIONS = 10
+
+	// maxRollCQAttempts is the maximum number of CQ attempts per roll.
+	maxRollCQAttempts = 3
 )
 
 // Interface for interacting with a single autoroll CL.
@@ -72,6 +75,10 @@
 	// Add a comment to the CL.
 	AddComment(context.Context, string) error
 
+	// Attempt returns the number of the current attempt for this roll, starting
+	// at zero for the first attempt.
+	Attempt() int
+
 	// Close the CL. The first string argument is the result of the roll,
 	// and the second is the message to add to the CL on closing.
 	Close(context.Context, string, string) error
@@ -598,15 +605,17 @@
 		if currentRoll.IsClosed() {
 			return S_NORMAL_IDLE, nil
 		}
-		throttle := s.a.FailureThrottle()
-		if err := throttle.Inc(ctx); err != nil {
-			return "", err
-		}
-		if s.a.GetNextRollRev().Id == currentRoll.RollingTo().Id {
-			// Rather than upload the same CL again, we'll try
-			// running the CQ again after a period of throttling.
-			if throttle.IsThrottled() {
-				return S_NORMAL_FAILURE_THROTTLED, nil
+		if currentRoll.Attempt() < maxRollCQAttempts-1 {
+			throttle := s.a.FailureThrottle()
+			if err := throttle.Inc(ctx); err != nil {
+				return "", err
+			}
+			if s.a.GetNextRollRev().Id == currentRoll.RollingTo().Id {
+				// Rather than upload the same CL again, we'll try
+				// running the CQ again after a period of throttling.
+				if throttle.IsThrottled() {
+					return S_NORMAL_FAILURE_THROTTLED, nil
+				}
 			}
 		}
 		return S_NORMAL_IDLE, nil
@@ -727,13 +736,15 @@
 		if currentRoll == nil {
 			return S_CURRENT_ROLL_MISSING, nil
 		}
-		if err := s.a.FailureThrottle().Inc(ctx); err != nil {
-			return "", err
-		}
-		if s.a.GetNextRollRev().Id == currentRoll.RollingTo().Id {
-			// Rather than upload the same CL again, we'll try
-			// running the CQ again after a period of throttling.
-			return S_DRY_RUN_FAILURE_THROTTLED, nil
+		if currentRoll.Attempt() < maxRollCQAttempts-1 {
+			if err := s.a.FailureThrottle().Inc(ctx); err != nil {
+				return "", err
+			}
+			if s.a.GetNextRollRev().Id == currentRoll.RollingTo().Id {
+				// Rather than upload the same CL again, we'll try
+				// running the CQ again after a period of throttling.
+				return S_DRY_RUN_FAILURE_THROTTLED, nil
+			}
 		}
 		return S_DRY_RUN_IDLE, nil
 	case S_DRY_RUN_FAILURE_THROTTLED:

diff --git a/autoroll/go/state_machine/state_machine_test.go b/autoroll/go/state_machine/state_machine_test.go
index 97640c8..1d42df3 100644
--- a/autoroll/go/state_machine/state_machine_test.go
+++ b/autoroll/go/state_machine/state_machine_test.go

@@ -23,6 +23,7 @@
 // A completely mocked implementation of RollCLImpl.
 type TestRollCLImpl struct {
 	t            *testing.T
+	attempt      int
 	closedStatus string
 	closedMsg    string
 	isDryRun     bool
@@ -47,6 +48,11 @@
 }
 
 // See documentation for RollCLImpl.
+func (r *TestRollCLImpl) Attempt() int {
+	return r.attempt
+}
+
+// See documentation for RollCLImpl.
 func (r *TestRollCLImpl) Close(ctx context.Context, status, msg string) error {
 	r.closedStatus = status
 	r.closedMsg = msg
@@ -147,6 +153,7 @@
 func (r *TestRollCLImpl) RetryDryRun(ctx context.Context) error {
 	r.isDryRun = true
 	r.dryRunResult = ""
+	r.attempt++
 	return nil
 }
 
@@ -154,6 +161,7 @@
 func (r *TestRollCLImpl) RetryCQ(ctx context.Context) error {
 	r.isDryRun = false
 	r.normalResult = ""
+	r.attempt++
 	return nil
 }
 
@@ -852,3 +860,69 @@
 		require.NoError(t, sm.NextTransition(ctx))
 	}
 }
+
+func TestAbandonAfterTooManyAttempts(t *testing.T) {
+	ctx, sm, r, gcsClient, cleanup := setup(t)
+	defer cleanup()
+
+	counterFile := "fail_counter"
+	failureThrottle, err := NewThrottler(ctx, gcsClient, counterFile, time.Hour, 1)
+	require.NoError(t, err)
+	r.failureThrottle = failureThrottle
+
+	r.SetNextRollRev("HEAD+1")
+
+	failCL := func(expectedState string) {
+		checkNextState(t, sm, S_NORMAL_ACTIVE)
+		roll := r.GetActiveRoll().(*TestRollCLImpl)
+		roll.SetFailed()
+		checkNextState(t, sm, S_NORMAL_FAILURE)
+		checkNextState(t, sm, expectedState)
+	}
+
+	for i := 0; i < maxRollCQAttempts-1; i++ {
+		failCL(S_NORMAL_FAILURE_THROTTLED)
+
+		// Hack the timer to fake that the throttling has expired, then ensure
+		// that we end up at the expected state.
+		require.NoError(t, gcsClient.DeleteFile(ctx, counterFile))
+		failureThrottle, err = NewThrottler(ctx, gcsClient, counterFile, time.Minute, 1)
+		require.NoError(t, err)
+		r.failureThrottle = failureThrottle
+	}
+	failCL(S_NORMAL_IDLE)
+}
+
+func TestAbandonAfterTooManyAttemptsDryRun(t *testing.T) {
+	ctx, sm, r, gcsClient, cleanup := setup(t)
+	defer cleanup()
+
+	counterFile := "fail_counter"
+	failureThrottle, err := NewThrottler(ctx, gcsClient, counterFile, time.Hour, 1)
+	require.NoError(t, err)
+	r.failureThrottle = failureThrottle
+
+	r.SetMode(ctx, modes.ModeDryRun)
+	checkNextState(t, sm, S_DRY_RUN_IDLE)
+	r.SetNextRollRev("HEAD+1")
+
+	failCL := func(expectedState string) {
+		checkNextState(t, sm, S_DRY_RUN_ACTIVE)
+		roll := r.GetActiveRoll().(*TestRollCLImpl)
+		roll.SetDryRunFailed()
+		checkNextState(t, sm, S_DRY_RUN_FAILURE)
+		checkNextState(t, sm, expectedState)
+	}
+
+	for i := 0; i < maxRollCQAttempts-1; i++ {
+		failCL(S_DRY_RUN_FAILURE_THROTTLED)
+
+		// Hack the timer to fake that the throttling has expired, then ensure
+		// that we end up at the expected state.
+		require.NoError(t, gcsClient.DeleteFile(ctx, counterFile))
+		failureThrottle, err = NewThrottler(ctx, gcsClient, counterFile, time.Minute, 1)
+		require.NoError(t, err)
+		r.failureThrottle = failureThrottle
+	}
+	failCL(S_DRY_RUN_IDLE)
+}

diff --git a/go/autoroll/autoroll.go b/go/autoroll/autoroll.go
index b1b7a28..25c411e 100644
--- a/go/autoroll/autoroll.go
+++ b/go/autoroll/autoroll.go

@@ -75,6 +75,7 @@
 // AutoRollIssue is a struct containing the information we care about for
 // AutoRoll CLs.
 type AutoRollIssue struct {
+	Attempt        int                `json:"attempts"`
 	Closed         bool               `json:"closed"`
 	Comments       []*comment.Comment `json:"comments"`
 	Committed      bool               `json:"committed"`
@@ -139,6 +140,7 @@
 		}
 	}
 	return &AutoRollIssue{
+		Attempt:        i.Attempt,
 		Closed:         i.Closed,
 		Comments:       commentsCpy,
 		Committed:      i.Committed,

diff --git a/go/autoroll/autoroll_test.go b/go/autoroll/autoroll_test.go
index 93e1b6e..2809357 100644
--- a/go/autoroll/autoroll_test.go
+++ b/go/autoroll/autoroll_test.go

@@ -17,7 +17,8 @@
 func TestAutoRollIssueCopy(t *testing.T) {
 	unittest.SmallTest(t)
 	roll := &AutoRollIssue{
-		Closed: true,
+		Attempt: 2,
+		Closed:  true,
 		Comments: []*comment.Comment{
 			{
 				Id:        "123",
commit	5758def28ecb6cededae8fb444ce8d5fe8043430	[log] [tgz]
author	Eric Boren <borenet@google.com>	Tue Aug 03 12:18:46 2021 -0400
committer	SkCQ <skcq-be@skia-corp.google.com.iam.gserviceaccount.com>	Tue Aug 03 17:08:43 2021 +0000
tree	93c5e59ab16758090525e71c550e15d53aa12c06
parent	751ab60fb293be2f328d9408524f08fdd10b1a67 [diff]