[CT] Do not retry tasks if they have been "killed"
This is to make master scripts stop retrying tasks while tasks are being killed via the delete button.
Bug: skia:8163
Change-Id: I4836170b840b6ef6e4283de9de3854f118c11897
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/240678
Commit-Queue: Ravi Mistry <rmistry@google.com>
Reviewed-by: Ben Wagner aka dogben <benjaminwagner@google.com>
diff --git a/ct/go/ctfe/task_common/task_common.go b/ct/go/ctfe/task_common/task_common.go
index 44d9573..d717099 100644
--- a/ct/go/ctfe/task_common/task_common.go
+++ b/ct/go/ctfe/task_common/task_common.go
@@ -517,7 +517,7 @@
sklog.Errorf("Could not cancel %s: %s", t.TaskId, err)
continue
}
- sklog.Infof("Deleted %s", t.TaskId)
+ sklog.Infof("Canceled %s", t.TaskId)
}
}()
}
diff --git a/ct/go/util/util.go b/ct/go/util/util.go
index a3cdeba..2ba6926 100644
--- a/ct/go/util/util.go
+++ b/ct/go/util/util.go
@@ -531,8 +531,12 @@
task := task // https://golang.org/doc/faq#closures_and_goroutines
go func() {
defer wg.Done()
- if _, _, err := task.Collect(ctx, s, false, false); err != nil {
+ if _, _, state, err := task.Collect(ctx, s, false, false); err != nil {
sklog.Errorf("task %s failed: %s", task.Title, err)
+ if state == swarming.TASK_STATE_KILLED {
+ sklog.Infof("task %s was killed (either manually or via CT's delete button). Not going to retry it.", task.Title)
+ return
+ }
sklog.Infof("Retrying task %s with high priority %d", task.Title, TASKS_PRIORITY_HIGH)
retryTask, err := s.TriggerSwarmingTasks(ctx, map[string]string{task.Title: tasksToHashes[task.Title]}, dimensions, map[string]string{"runid": runID}, []string{}, TASKS_PRIORITY_HIGH, 7*24*time.Hour, hardTimeout, ioTimeout, false, true, getServiceAccount(dimensions))
if err != nil {
@@ -540,7 +544,7 @@
return
}
// Collect the retried task.
- if _, _, err := retryTask[0].Collect(ctx, s, false, false); err != nil {
+ if _, _, _, err := retryTask[0].Collect(ctx, s, false, false); err != nil {
sklog.Errorf("task %s failed inspite of a retry: %s", retryTask[0].Title, err)
return
}
@@ -1017,7 +1021,7 @@
}
// Collect all tasks and log the ones that fail.
task := tasks[0]
- _, outputDir, err := task.Collect(ctx, s, false, false)
+ _, outputDir, _, err := task.Collect(ctx, s, false, false)
if err != nil {
return "", fmt.Errorf("task %s failed: %s", task.Title, err)
}
@@ -1132,7 +1136,7 @@
}
// Collect all tasks and log the ones that fail.
task := tasks[0]
- _, outputDir, err := task.Collect(ctx, s, false, false)
+ _, outputDir, _, err := task.Collect(ctx, s, false, false)
if err != nil {
return nil, fmt.Errorf("task %s failed: %s", task.Title, err)
}
diff --git a/go/swarming/swarming.go b/go/swarming/swarming.go
index ad604af..1c3f999 100644
--- a/go/swarming/swarming.go
+++ b/go/swarming/swarming.go
@@ -57,8 +57,8 @@
}
type ShardOutputFormat struct {
- ExitCode string `json:"exit_code"`
- Output string `json:"output"`
+ Output string `json:"output"`
+ State string `json:"state"`
}
type TaskOutputFormat struct {
@@ -138,9 +138,17 @@
return nil
}
-func (t *SwarmingTask) Collect(ctx context.Context, s *SwarmingClient, logStdout, logStderr bool) (string, string, error) {
- if err := _VerifyBinaryExists(ctx, s.SwarmingPy); err != nil {
- return "", "", fmt.Errorf("Could not find swarming binary: %s", err)
+// Collect collects the swarming task. It is a blocking call that returns only after the task
+// completes. It returns the following:
+// * Output of the task.
+// * Location of the ${ISOLATED_OUTDIR}.
+// * State of the task. Eg: COMPLETED/KILLED.
+// * Error is non-nil if something goes wrong. If the command to collect returns a non-zero exit
+// code then error is non-nil but all of the above (output, outdir, state) are also returned if
+// known. This is useful for checking if a task failed because it was cancelled.
+func (t *SwarmingTask) Collect(ctx context.Context, s *SwarmingClient, logStdout, logStderr bool) (string, string, string, error) {
+ if verifyErr := _VerifyBinaryExists(ctx, s.SwarmingPy); verifyErr != nil {
+ return "", "", "", fmt.Errorf("Could not find swarming binary: %s", verifyErr)
}
dumpJSON := path.Join(t.OutputDir, fmt.Sprintf("%s-trigger-output.json", t.Title))
@@ -155,35 +163,38 @@
if s.ServiceAccountJSON != "" {
collectArgs = append(collectArgs, "--auth-service-account-json", s.ServiceAccountJSON)
}
- err := exec.Run(ctx, &exec.Command{
+ collectCmdErr := exec.Run(ctx, &exec.Command{
Name: s.SwarmingPy,
Args: collectArgs,
Timeout: t.Expiration,
LogStdout: logStdout,
LogStderr: logStderr,
})
- if err != nil {
- return "", "", fmt.Errorf("Swarming trigger for %s failed with: %s", t.Title, err)
- }
+ // Read and parse the summary file if it exists before checking for the error.
outputSummaryFile := path.Join(t.OutputDir, "summary.json")
- outputSummary, err := ioutil.ReadFile(outputSummaryFile)
- if err != nil {
- return "", "", fmt.Errorf("Could not read output summary %s: %s", outputSummaryFile, err)
- }
- var summaryOutput TaskOutputFormat
- if err := json.NewDecoder(bytes.NewReader(outputSummary)).Decode(&summaryOutput); err != nil {
- return "", "", fmt.Errorf("Could not decode %s: %s", outputSummaryFile, err)
+ output := ""
+ state := ""
+ if _, statErr := os.Stat(outputSummaryFile); statErr == nil {
+
+ outputSummary, readErr := ioutil.ReadFile(outputSummaryFile)
+ if readErr != nil {
+ return "", "", "", fmt.Errorf("Could not read output summary %s: %s", outputSummaryFile, readErr)
+ }
+ var summaryOutput TaskOutputFormat
+ if decodeErr := json.NewDecoder(bytes.NewReader(outputSummary)).Decode(&summaryOutput); decodeErr != nil {
+ return "", "", "", fmt.Errorf("Could not decode %s: %s", outputSummaryFile, decodeErr)
+ }
+ output = summaryOutput.Shards[0].Output
+ state = summaryOutput.Shards[0].State
}
- exitCode := summaryOutput.Shards[0].ExitCode
- output := summaryOutput.Shards[0].Output
// Directory that will contain output written to ${ISOLATED_OUTDIR}.
outputDir := path.Join(t.OutputDir, "0")
- if exitCode != "0" {
- return output, outputDir, fmt.Errorf("Non-zero exit code: %s", summaryOutput.Shards[0].ExitCode)
+ if collectCmdErr != nil {
+ return output, outputDir, state, fmt.Errorf("Swarming collect for %s failed with: %s", t.Title, collectCmdErr)
}
- return output, outputDir, nil
+ return output, outputDir, state, nil
}
// NewSwarmingClient returns an instance of Swarming populated with default
diff --git a/go/swarming/swarming_test.go b/go/swarming/swarming_test.go
index dc34105..6a42cb9 100644
--- a/go/swarming/swarming_test.go
+++ b/go/swarming/swarming_test.go
@@ -120,7 +120,7 @@
// Collect both output and file output of all tasks.
for _, task := range tasks {
- output, outputDir, err := task.Collect(ctx, s, true, true)
+ output, outputDir, _, err := task.Collect(ctx, s, true, true)
assert.NoError(t, err)
output = sanitizeOutput(output)
assert.Equal(t, fmt.Sprintf("arg_1_%s\narg_2_%s\n", task.Title, task.Title), output)
@@ -183,7 +183,7 @@
assert.NoError(t, err)
// Collect testTask1. It should have failed.
- output1, outputDir1, err1 := tasks[0].Collect(ctx, s, true, true)
+ output1, outputDir1, _, err1 := tasks[0].Collect(ctx, s, true, true)
assert.Equal(t, tags, tasks[0].Tags)
output1 = sanitizeOutput(output1)
assert.Equal(t, "", output1)
@@ -192,7 +192,7 @@
assert.True(t, strings.HasPrefix(err1.Error(), "Swarming trigger for testTask1 failed with: Command exited with exit status 1: "))
// Collect testTask2. It should have succeeded.
- output2, outputDir2, err2 := tasks[1].Collect(ctx, s, true, true)
+ output2, outputDir2, _, err2 := tasks[1].Collect(ctx, s, true, true)
assert.NoError(t, err2)
assert.Equal(t, tags, tasks[1].Tags)
output2 = sanitizeOutput(output2)