fiddle - Another attempt at fixing the 500 issue with fiddle.

This might really work as I learned that:
  * When pushing out a new version of fiddler, which happens often
    because it is on a continuous deployment system, up to 80 pods
    will be unavailable at any one time.

  * Also, that as pods come up, they may be reported via r.clientset.CoreV1().List(),
    but may not have an IP address assigned yet.

Combined that means that close to 1/3 of the pods we were trying to talk to
could actually be either gone, or not have an IP address assigned to them.
So this fix polls CoreV1().List() more frequently and then throws out
any IP addresses that are the empty string.

Bug: skia:8582
Change-Id: Ide73a46c76f634ba58b4f753d3de31c74e6eb36b
Reviewed-on: https://skia-review.googlesource.com/c/178863
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
Commit-Queue: Joe Gregorio <jcgregorio@google.com>
diff --git a/fiddlek/go/client/client.go b/fiddlek/go/client/client.go
index 00bf42a..5aadb9a 100644
--- a/fiddlek/go/client/client.go
+++ b/fiddlek/go/client/client.go
@@ -4,6 +4,7 @@
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"io/ioutil"
 	"net/http"
 	"time"
 
@@ -36,7 +37,12 @@
 		if failFast {
 			sklog.Fatalf("Send failed, with fail_fast set: %s", resp.Status)
 		}
-		sklog.Infof("Send failed: %s", resp.Status)
+		body := "(no body found)"
+		b, err := ioutil.ReadAll(resp.Body)
+		if err == nil {
+			body = string(b)
+		}
+		sklog.Infof("Send failed %q: %s", body, resp.Status)
 		time.Sleep(sleep)
 		return nil, false
 	}
diff --git a/fiddlek/go/runner/runner.go b/fiddlek/go/runner/runner.go
index 9511081..9af35c0 100644
--- a/fiddlek/go/runner/runner.go
+++ b/fiddlek/go/runner/runner.go
@@ -121,7 +121,10 @@
 		}
 		ips = make([]string, 0, len(pods.Items))
 		for _, p := range pods.Items {
-			ips = append(ips, p.Status.PodIP)
+			// Note that the PodIP can be the empty string. Who knew?
+			if p.Status.PodIP != "" {
+				ips = append(ips, p.Status.PodIP)
+			}
 		}
 	}
 	r.mutex.Lock()
@@ -132,9 +135,12 @@
 
 // fiddlerIPsRefresher refreshes a list of fiddler pod IP addresses.
 func (r *Runner) fiddlerIPsRefresher() {
-	for _ = range time.Tick(time.Minute) {
+	fiddlerIPLiveness := metrics2.NewLiveness("fiddler_ips")
+	for _ = range time.Tick(5 * time.Second) {
 		if err := r.fiddlerIPsOneStep(); err != nil {
 			sklog.Warningf("Failed to refresh fiddler IPs: %s", err)
+		} else {
+			fiddlerIPLiveness.Reset()
 		}
 	}
 }
@@ -204,7 +210,7 @@
 	var output bytes.Buffer
 	resp, err := r.client.Post(url, "application/json", body)
 	if err != nil {
-		sklog.Errorf("Failed to POST: %s", err)
+		sklog.Errorf("Failed to POST to %q: %s", url, err)
 		return nil, failedToSendErr
 	}
 	defer util.Close(resp.Body)
@@ -224,7 +230,7 @@
 	res := &types.Result{}
 	if err := json.Unmarshal(output.Bytes(), res); err != nil {
 		sklog.Errorf("Received erroneous output: %q", truncOutput)
-		return nil, fmt.Errorf("Failed to decode results from run: %s, %q", err, truncOutput)
+		return nil, fmt.Errorf("Failed to decode results from run at %q: %s, %q", url, err, truncOutput)
 	}
 	if strings.HasPrefix(res.Execute.Errors, "Invalid JSON Request") {
 		sklog.Errorf("Failed to send valid JSON: res.Execute.Errors : %s", err)
@@ -349,9 +355,11 @@
 
 // Metrics captures metrics on the state of all the fiddler pods.
 func (r *Runner) Metrics() {
+	metricsLiveness := metrics2.NewLiveness("metrics")
 	r.metricsSingleStep()
-	for _ = range time.Tick(2 * time.Minute) {
+	for _ = range time.Tick(10 * time.Second) {
 		r.metricsSingleStep()
+		metricsLiveness.Reset()
 	}
 }