[CT] Do not retry tasks if they have been "killed"

This is to make master scripts stop retrying tasks while tasks are being killed via the delete button.

Bug: skia:8163
Change-Id: I4836170b840b6ef6e4283de9de3854f118c11897
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/240678
Commit-Queue: Ravi Mistry <rmistry@google.com>
Reviewed-by: Ben Wagner aka dogben <benjaminwagner@google.com>
diff --git a/ct/go/ctfe/task_common/task_common.go b/ct/go/ctfe/task_common/task_common.go
index 44d9573..d717099 100644
--- a/ct/go/ctfe/task_common/task_common.go
+++ b/ct/go/ctfe/task_common/task_common.go
@@ -517,7 +517,7 @@
 						sklog.Errorf("Could not cancel %s: %s", t.TaskId, err)
 						continue
 					}
-					sklog.Infof("Deleted  %s", t.TaskId)
+					sklog.Infof("Canceled  %s", t.TaskId)
 				}
 			}()
 		}
diff --git a/ct/go/util/util.go b/ct/go/util/util.go
index a3cdeba..2ba6926 100644
--- a/ct/go/util/util.go
+++ b/ct/go/util/util.go
@@ -531,8 +531,12 @@
 			task := task // https://golang.org/doc/faq#closures_and_goroutines
 			go func() {
 				defer wg.Done()
-				if _, _, err := task.Collect(ctx, s, false, false); err != nil {
+				if _, _, state, err := task.Collect(ctx, s, false, false); err != nil {
 					sklog.Errorf("task %s failed: %s", task.Title, err)
+					if state == swarming.TASK_STATE_KILLED {
+						sklog.Infof("task %s was killed (either manually or via CT's delete button). Not going to retry it.", task.Title)
+						return
+					}
 					sklog.Infof("Retrying task %s with high priority %d", task.Title, TASKS_PRIORITY_HIGH)
 					retryTask, err := s.TriggerSwarmingTasks(ctx, map[string]string{task.Title: tasksToHashes[task.Title]}, dimensions, map[string]string{"runid": runID}, []string{}, TASKS_PRIORITY_HIGH, 7*24*time.Hour, hardTimeout, ioTimeout, false, true, getServiceAccount(dimensions))
 					if err != nil {
@@ -540,7 +544,7 @@
 						return
 					}
 					// Collect the retried task.
-					if _, _, err := retryTask[0].Collect(ctx, s, false, false); err != nil {
+					if _, _, _, err := retryTask[0].Collect(ctx, s, false, false); err != nil {
 						sklog.Errorf("task %s failed inspite of a retry: %s", retryTask[0].Title, err)
 						return
 					}
@@ -1017,7 +1021,7 @@
 	}
 	// Collect all tasks and log the ones that fail.
 	task := tasks[0]
-	_, outputDir, err := task.Collect(ctx, s, false, false)
+	_, outputDir, _, err := task.Collect(ctx, s, false, false)
 	if err != nil {
 		return "", fmt.Errorf("task %s failed: %s", task.Title, err)
 	}
@@ -1132,7 +1136,7 @@
 	}
 	// Collect all tasks and log the ones that fail.
 	task := tasks[0]
-	_, outputDir, err := task.Collect(ctx, s, false, false)
+	_, outputDir, _, err := task.Collect(ctx, s, false, false)
 	if err != nil {
 		return nil, fmt.Errorf("task %s failed: %s", task.Title, err)
 	}
diff --git a/go/swarming/swarming.go b/go/swarming/swarming.go
index ad604af..1c3f999 100644
--- a/go/swarming/swarming.go
+++ b/go/swarming/swarming.go
@@ -57,8 +57,8 @@
 }
 
 type ShardOutputFormat struct {
-	ExitCode string `json:"exit_code"`
-	Output   string `json:"output"`
+	Output string `json:"output"`
+	State  string `json:"state"`
 }
 
 type TaskOutputFormat struct {
@@ -138,9 +138,17 @@
 	return nil
 }
 
-func (t *SwarmingTask) Collect(ctx context.Context, s *SwarmingClient, logStdout, logStderr bool) (string, string, error) {
-	if err := _VerifyBinaryExists(ctx, s.SwarmingPy); err != nil {
-		return "", "", fmt.Errorf("Could not find swarming binary: %s", err)
+// Collect collects the swarming task. It is a blocking call that returns only after the task
+// completes. It returns the following:
+// * Output of the task.
+// * Location of the ${ISOLATED_OUTDIR}.
+// * State of the task. Eg: COMPLETED/KILLED.
+// * Error is non-nil if something goes wrong. If the command to collect returns a non-zero exit
+//   code then error is non-nil but all of the above (output, outdir, state) are also returned if
+//   known. This is useful for checking if a task failed because it was cancelled.
+func (t *SwarmingTask) Collect(ctx context.Context, s *SwarmingClient, logStdout, logStderr bool) (string, string, string, error) {
+	if verifyErr := _VerifyBinaryExists(ctx, s.SwarmingPy); verifyErr != nil {
+		return "", "", "", fmt.Errorf("Could not find swarming binary: %s", verifyErr)
 	}
 	dumpJSON := path.Join(t.OutputDir, fmt.Sprintf("%s-trigger-output.json", t.Title))
 
@@ -155,35 +163,38 @@
 	if s.ServiceAccountJSON != "" {
 		collectArgs = append(collectArgs, "--auth-service-account-json", s.ServiceAccountJSON)
 	}
-	err := exec.Run(ctx, &exec.Command{
+	collectCmdErr := exec.Run(ctx, &exec.Command{
 		Name:      s.SwarmingPy,
 		Args:      collectArgs,
 		Timeout:   t.Expiration,
 		LogStdout: logStdout,
 		LogStderr: logStderr,
 	})
-	if err != nil {
-		return "", "", fmt.Errorf("Swarming trigger for %s failed with: %s", t.Title, err)
-	}
 
+	// Read and parse the summary file if it exists before checking for the error.
 	outputSummaryFile := path.Join(t.OutputDir, "summary.json")
-	outputSummary, err := ioutil.ReadFile(outputSummaryFile)
-	if err != nil {
-		return "", "", fmt.Errorf("Could not read output summary %s: %s", outputSummaryFile, err)
-	}
-	var summaryOutput TaskOutputFormat
-	if err := json.NewDecoder(bytes.NewReader(outputSummary)).Decode(&summaryOutput); err != nil {
-		return "", "", fmt.Errorf("Could not decode %s: %s", outputSummaryFile, err)
+	output := ""
+	state := ""
+	if _, statErr := os.Stat(outputSummaryFile); statErr == nil {
+
+		outputSummary, readErr := ioutil.ReadFile(outputSummaryFile)
+		if readErr != nil {
+			return "", "", "", fmt.Errorf("Could not read output summary %s: %s", outputSummaryFile, readErr)
+		}
+		var summaryOutput TaskOutputFormat
+		if decodeErr := json.NewDecoder(bytes.NewReader(outputSummary)).Decode(&summaryOutput); decodeErr != nil {
+			return "", "", "", fmt.Errorf("Could not decode %s: %s", outputSummaryFile, decodeErr)
+		}
+		output = summaryOutput.Shards[0].Output
+		state = summaryOutput.Shards[0].State
 	}
 
-	exitCode := summaryOutput.Shards[0].ExitCode
-	output := summaryOutput.Shards[0].Output
 	// Directory that will contain output written to ${ISOLATED_OUTDIR}.
 	outputDir := path.Join(t.OutputDir, "0")
-	if exitCode != "0" {
-		return output, outputDir, fmt.Errorf("Non-zero exit code: %s", summaryOutput.Shards[0].ExitCode)
+	if collectCmdErr != nil {
+		return output, outputDir, state, fmt.Errorf("Swarming collect for %s failed with: %s", t.Title, collectCmdErr)
 	}
-	return output, outputDir, nil
+	return output, outputDir, state, nil
 }
 
 // NewSwarmingClient returns an instance of Swarming populated with default
diff --git a/go/swarming/swarming_test.go b/go/swarming/swarming_test.go
index dc34105..6a42cb9 100644
--- a/go/swarming/swarming_test.go
+++ b/go/swarming/swarming_test.go
@@ -120,7 +120,7 @@
 
 	// Collect both output and file output of all tasks.
 	for _, task := range tasks {
-		output, outputDir, err := task.Collect(ctx, s, true, true)
+		output, outputDir, _, err := task.Collect(ctx, s, true, true)
 		assert.NoError(t, err)
 		output = sanitizeOutput(output)
 		assert.Equal(t, fmt.Sprintf("arg_1_%s\narg_2_%s\n", task.Title, task.Title), output)
@@ -183,7 +183,7 @@
 	assert.NoError(t, err)
 
 	// Collect testTask1. It should have failed.
-	output1, outputDir1, err1 := tasks[0].Collect(ctx, s, true, true)
+	output1, outputDir1, _, err1 := tasks[0].Collect(ctx, s, true, true)
 	assert.Equal(t, tags, tasks[0].Tags)
 	output1 = sanitizeOutput(output1)
 	assert.Equal(t, "", output1)
@@ -192,7 +192,7 @@
 	assert.True(t, strings.HasPrefix(err1.Error(), "Swarming trigger for testTask1 failed with: Command exited with exit status 1: "))
 
 	// Collect testTask2. It should have succeeded.
-	output2, outputDir2, err2 := tasks[1].Collect(ctx, s, true, true)
+	output2, outputDir2, _, err2 := tasks[1].Collect(ctx, s, true, true)
 	assert.NoError(t, err2)
 	assert.Equal(t, tags, tasks[1].Tags)
 	output2 = sanitizeOutput(output2)