package incident

import (
"context"
"crypto/md5"
"encoding/json"
"fmt"
"regexp"
"sort"
"strings"
"time"
"cloud.google.com/go/datastore"
"go.skia.org/infra/am/go/note"
"go.skia.org/infra/am/go/silence"
"go.skia.org/infra/go/alerts"
"go.skia.org/infra/go/ds"
"go.skia.org/infra/go/human"
"go.skia.org/infra/go/metrics2"
"go.skia.org/infra/go/paramtools"
"go.skia.org/infra/go/sklog"
"go.skia.org/infra/go/timer"
"go.skia.org/infra/go/util"
)

// Well known keys for Incident.Params.
const (
ALERT_NAME = "alertname"
CATEGORY = "category"
SEVERITY = "severity"
ID = "id"
ASSIGNED_TO = "assigned_to"
ABBR = "abbr"
OWNER = "owner"
ABBR_OWNER_REGEX = "abbr_owner_regex"
K8S_POD_NAME = "kubernetes_pod_name"
COMMITTED_IMAGE = "committedImage"
LIVE_IMAGE = "liveImage"
)

const (
TX_RETRIES = 5
NUM_RECENTLY_RESOLVED = 20
NUM_RECENTLY_RESOLVED_FOR_ID = 20
)

// Well known alert names.
const (
DirtyCommittedK8sImageAlertName = "DirtyCommittedK8sImage"
StaleK8sImageAlertName = "StaleK8sImage"
DirtyRunningK8sConfigAlertName = "DirtyRunningK8sConfig"
)

// Matches images like gcr.io/skia-public/autoroll-be:2021-04-30T14_04_37Z-borenet-c3ecfbb-dirty
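// For that example image, the Owner capture group is "borenet".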
const DockerImageRegexString = ".+?-(?P<Owner>\\w+?)-\\w+?-(dirty|clean)$"

var DockerImageRegex = regexp.MustCompile(DockerImageRegexString)

// Incident - An alert that is being acted on.
//
// Each alert has an ID which is the same each time that exact alert is fired.
// Not to be confused with the Key which is the datastore key for a single
// incident of an alert firing. There will be many Incidents in the datastore
// with the same ID, but at most one will be Active.
type Incident struct {
Key string `json:"key" datastore:"key"` // Key is the web-safe serialized Datastore key for the incident.
ID string `json:"id" datastore:"id"` // Also appears in Params.
Active bool `json:"active" datastore:"active"` // Or archived.
Start int64 `json:"start" datastore:"start"` // Time in seconds since the epoch.
LastSeen int64 `json:"last_seen" datastore:"last_seen"` // Time in seconds since the epoch.
	Params paramtools.Params `json:"params" datastore:"-"` // The key/value pairs of the alert, e.g. alertname, category, severity.
ParamsSerial string `json:"-" datastore:"params_serial,noindex"` // Params serialized as JSON for easy storing in the datastore.
Notes []note.Note `json:"notes" datastore:"notes,flatten"`
}

// Load implements datastore.PropertyLoadSaver, converting the serialized JSON
// params back into a map[string]string.
func (in *Incident) Load(ps []datastore.Property) error {
if err := datastore.LoadStruct(in, ps); err != nil {
return err
}
if err := json.Unmarshal([]byte(in.ParamsSerial), &in.Params); err != nil {
return err
}
return nil
}

// Save implements datastore.PropertyLoadSaver, serializing the params as JSON.
func (in *Incident) Save() ([]datastore.Property, error) {
b, err := json.Marshal(in.Params)
if err != nil {
return nil, err
}
in.ParamsSerial = string(b)
return datastore.SaveStruct(in)
}

// IsSilenced returns true if any of the given silences apply to this
// incident. If matchOnlyActiveSilences is true, inactive silences are
// ignored. Silence param values are supported as regexes (see skbug.com/9587).
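//
// For example, a silence with ParamSet {"alertname": ["Dirty.*"]} silences an
// incident whose Params include alertname="DirtyPackages", since each silence
// value is matched as the anchored regex "^Dirty.*$".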
func (in *Incident) IsSilenced(silences []silence.Silence, matchOnlyActiveSilences bool) bool {
ps := paramtools.NewParamSet(in.Params)
for _, s := range silences {
if !s.Active && matchOnlyActiveSilences {
continue
}
if s.ParamSet.Matches(ps) {
return true
}
allSilenceKeysMatched := true
for sKey, sValues := range s.ParamSet {
valueMatchedForKey := false
if iVal, ok := in.Params[sKey]; ok {
for _, sVal := range sValues {
					sValRegex, err := regexp.Compile(fmt.Sprintf("^%s$", sVal))
					if err != nil {
						// A silence value that is not a valid regex can never match;
						// log it rather than panicking via MustCompile.
						sklog.Errorf("Silence value %q is not a valid regex: %s", sVal, err)
						continue
					}
					if sValRegex.MatchString(iVal) {
valueMatchedForKey = true
break
}
}
}
if !valueMatchedForKey {
allSilenceKeysMatched = false
break
}
}
if allSilenceKeysMatched {
return true
}
}
return false
}

// Store stores and retrieves Incidents in Cloud Datastore.
type Store struct {
ignoredAttr []string // key-value pairs to ignore when computing IDs, such as kubernetes_pod_name, instance, and pod_template_hash.
ds *datastore.Client
alertArrivalLatency metrics2.Float64SummaryMetric
}

// NewStore creates a new Store.
//
// ds - Datastore client.
// ignoredAttr - A list of keys to ignore when calculating an Incident's ID.
func NewStore(ds *datastore.Client, ignoredAttr []string) *Store {
ignored := []string{}
ignored = append(ignored, ignoredAttr...)
ignored = append(ignored, alerts.STATE, ID, ASSIGNED_TO)
return &Store{
ignoredAttr: ignored,
ds: ds,
alertArrivalLatency: metrics2.GetFloat64SummaryMetric("alert_manager_store_alert_arrival_latency"),
}
}

// idForAlert calculates the ID for an Incident, which is the md5 sum of all
// the sorted non-ignored keys and values.
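//
// For example, two alerts that differ only in an ignored key, such as
// kubernetes_pod_name, hash to the same ID, so a pod restart does not open a
// new Incident.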
func (s *Store) idForAlert(m map[string]string) (string, error) {
if m[ID] != "" {
return m[ID], nil
}
if m[alerts.TYPE] == alerts.TYPE_HEALTHZ {
return "", fmt.Errorf("Healthz events should be ignored.")
}
keys := paramtools.Params(m).Keys()
sort.Strings(keys)
h := md5.New()
for _, key := range keys {
if util.In(key, s.ignoredAttr) {
continue
}
h.Write([]byte(key))
h.Write([]byte(m[key]))
}
return fmt.Sprintf("%x", h.Sum(nil)), nil
}

// getRegexesToOwners expects abbr_owner_regex to be in the form of:
// owner1:abbr_regex1,abbr_regex2;owner2:abbr_regex3,abbr_regex4
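//
// For example, "alice:Build-.*,Test-.*;bob:Perf-.*" returns
// {"Build-.*": "alice", "Test-.*": "alice", "Perf-.*": "bob"}.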
func getRegexesToOwners(abbrOwnerRegex string) (map[string]string, error) {
ownersToRegexes := strings.Split(abbrOwnerRegex, ";")
// Map to populate and return.
regexToOwner := map[string]string{}
for _, ownerToRegex := range ownersToRegexes {
tokens := strings.Split(ownerToRegex, ":")
if len(tokens) != 2 {
return nil, fmt.Errorf("%s is not a well-formed abbr_owner_regex", ownerToRegex)
}
owner := tokens[0]
regexes := strings.Split(tokens[1], ",")
for _, regex := range regexes {
regexToOwner[regex] = owner
}
}
return regexToOwner, nil
}

// getOwnerIfMatch returns the owner whose regex in abbrOwnerRegex matches
// abbr, or an empty string if no regex matches. If more than one regex
// matches, which owner is returned is unspecified, since map iteration order
// is randomized.
func getOwnerIfMatch(abbrOwnerRegex, abbr string) (string, error) {
regexesToOwner, err := getRegexesToOwners(abbrOwnerRegex)
if err != nil {
return "", fmt.Errorf("Error when parsing %s: %s", abbrOwnerRegex, err)
}
for regex, owner := range regexesToOwner {
m, err := regexp.MatchString(regex, abbr)
if err != nil {
return "", fmt.Errorf("Error when compiling %s: %s", regex, err)
}
if m {
return owner, nil
}
}
return "", nil
}

// getOwnerFromDockerImage returns the full email address of an owner parsed
// from a Docker image name, or an empty string if the name does not match
// DockerImageRegex.
func getOwnerFromDockerImage(dockerImage string) string {
matches := DockerImageRegex.FindStringSubmatch(dockerImage)
if len(matches) == 0 {
return ""
}
	ownerIndex := DockerImageRegex.SubexpIndex("Owner")
	if ownerIndex == -1 {
		return ""
	}
	return fmt.Sprintf("%s@google.com", matches[ownerIndex])
}

// inFromAlert creates an Incident from an alert.
func (s *Store) inFromAlert(m map[string]string, id string) *Incident {
m[ID] = id
// The following attempts to determine an owner from the alert's parameters.
//
// 1. Determine owner using the abbr_owner_regex label.
	if abbr, abbrExists := m[ABBR]; abbrExists {
		if abbrOwnerRegex, regexExists := m[ABBR_OWNER_REGEX]; regexExists {
			owner, err := getOwnerIfMatch(abbrOwnerRegex, abbr)
			if err != nil {
				sklog.Errorf("Could not match %s with %s: %s", abbrOwnerRegex, abbr, err)
			} else if owner != "" {
				m[OWNER] = owner
			}
}
}
if alertName, alertNameExists := m[ALERT_NAME]; alertNameExists {
// 2. Determine owner by parsing the committedImage for DirtyCommittedK8sImage alerts.
if alertName == DirtyCommittedK8sImageAlertName {
if committedImage, committedImageExists := m[COMMITTED_IMAGE]; committedImageExists {
owner := getOwnerFromDockerImage(committedImage)
if owner != "" {
m[OWNER] = owner
}
}
}
// 3. Determine owner by parsing the liveImage for StaleK8sImage alerts.
// 4. Determine owner by parsing the liveImage for DirtyRunningK8sConfig alerts.
if alertName == StaleK8sImageAlertName || alertName == DirtyRunningK8sConfigAlertName {
if liveImage, liveImageExists := m[LIVE_IMAGE]; liveImageExists {
owner := getOwnerFromDockerImage(liveImage)
if owner != "" {
m[OWNER] = owner
}
}
}
}
now := time.Now().Unix()
return &Incident{
Active: true,
ID: id,
Start: now,
LastSeen: now,
Params: m,
Notes: []note.Note{},
}
}

// AlertArrival turns alerts into Incidents, or archives Incidents if
// the arriving state is resolved.
//
// Note that it is possible for the returned incident to be nil even if the
// returned error is nil. An example of when this can happen: we receive an
// alert for an incident that is no longer active.
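//
// A minimal usage sketch; the alert map below is illustrative, and dsClient
// is assumed to be an existing *datastore.Client:
//
//	store := NewStore(dsClient, []string{"kubernetes_pod_name"})
//	m := map[string]string{ALERT_NAME: "StaleK8sImage", CATEGORY: "infra"}
//	m[alerts.STATE] = alerts.STATE_ACTIVE
//	in, err := store.AlertArrival(m)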
func (s *Store) AlertArrival(m map[string]string) (*Incident, error) {
defer timer.NewWithSummaryOnly(s.alertArrivalLatency).Stop()
	// If there is a matching active Incident then just update its LastSeen
	// value, otherwise create a new Incident and store it.
id, err := s.idForAlert(m)
if err != nil {
return nil, err
}
ancestor := ds.NewKey(ds.INCIDENT_ACTIVE_PARENT_AM)
ancestor.Name = id
key := ds.NewKey(ds.INCIDENT_AM)
key.Parent = ancestor
alertState, ok := m[alerts.STATE]
if !ok {
alertState = alerts.STATE_ACTIVE
}
ctx := context.Background()
var active []*Incident
for i := 0; i < TX_RETRIES; i++ {
// Inside a transaction.
var tx *datastore.Transaction
tx, err = s.ds.NewTransaction(ctx)
if err != nil {
return nil, fmt.Errorf("Could not create transaction: %s", err)
}
// Find all active Incidents with the given ID.
q := ds.NewQuery(ds.INCIDENT_AM).Ancestor(ancestor).Filter("active=", true).Transaction(tx)
var keys []*datastore.Key
active = []*Incident{}
keys, err = s.ds.GetAll(ctx, q, &active)
if err != nil {
sklog.Errorf("Failed to retrieve: %s", err)
break
}
// Either create a new Incident or update an existing Incident.
if len(active) == 0 {
if alertState == alerts.STATE_RESOLVED {
sklog.Warningf("Received alert for incident that isn't active. Id: %s Alert: %+v", id, m)
return nil, nil
}
sklog.Infof("New: %s", id)
in := s.inFromAlert(m, id)
active = append(active, in)
} else {
key = keys[0]
active[0].LastSeen = time.Now().Unix()
active[0].Key = key.Encode()
if existingAlertPodName, ok := active[0].Params[K8S_POD_NAME]; ok {
if newAlertPodName, ok := m[K8S_POD_NAME]; ok {
if alertState == alerts.STATE_RESOLVED && newAlertPodName != existingAlertPodName {
						// We have received an already resolved alert for a pod that is different from the pod in
						// the datastore. This might be an occurrence of the problem described in
// https://bugs.chromium.org/p/skia/issues/detail?id=9551#c9
// Logging and leaving the current active alert alone.
sklog.Warningf("Received already resolved alert %+v from pod %s. Ignoring it since there is an active alert with id %s for pod %s", m, newAlertPodName, id, existingAlertPodName)
return nil, nil
}
}
}
}
// Write to the Datastore and keep track of the Incident key.
active[0].Active = alertState != alerts.STATE_RESOLVED
var pending *datastore.PendingKey
pending, err = tx.Put(key, active[0])
if err != nil {
break
}
		var commit *datastore.Commit
		commit, err = tx.Commit()
		if err == datastore.ErrConcurrentTransaction {
			continue
		}
		if err != nil {
			// Don't dereference commit, which is nil on non-retryable errors.
			break
		}
		active[0].Key = commit.Key(pending).Encode()
break
}
if err != nil {
return nil, fmt.Errorf("Failed to save incoming alert %v: %s", m, err)
}
return active[0], nil
}

// mutateIncident is a utility function that applies mutator to the Incident
// stored under encodedKey inside a transaction.
func (s *Store) mutateIncident(encodedKey string, mutator func(in *Incident) error) (*Incident, error) {
key, err := datastore.DecodeKey(encodedKey)
if err != nil {
return nil, err
}
var in Incident
_, err = s.ds.RunInTransaction(context.Background(), func(tx *datastore.Transaction) error {
if err := tx.Get(key, &in); err != nil {
return err
}
if err := mutator(&in); err != nil {
return err
}
if _, err := tx.Put(key, &in); err != nil {
return err
}
return nil
})
in.Key = encodedKey
return &in, err
}
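
// AddNote appends the given note to the Incident stored under encodedKey.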
func (s *Store) AddNote(encodedKey string, note note.Note) (*Incident, error) {
	return s.mutateIncident(encodedKey, func(in *Incident) error {
in.Notes = append(in.Notes, note)
return nil
})
}
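
// DeleteNote removes the note at the given index from the Incident stored
// under encodedKey.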
func (s *Store) DeleteNote(encodedKey string, index int) (*Incident, error) {
	return s.mutateIncident(encodedKey, func(in *Incident) error {
		if index < 0 || index >= len(in.Notes) {
return fmt.Errorf("Index for delete out of range.")
}
in.Notes = append(in.Notes[:index], in.Notes[index+1:]...)
return nil
})
}
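
// Assign sets the assigned_to param of the Incident stored under encodedKey.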
func (s *Store) Assign(encodedKey string, user string) (*Incident, error) {
	return s.mutateIncident(encodedKey, func(in *Incident) error {
in.Params[ASSIGNED_TO] = user
return nil
})
}
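
// Archive marks the Incident stored under encodedKey as no longer active.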
func (s *Store) Archive(encodedKey string) (*Incident, error) {
	return s.mutateIncident(encodedKey, func(in *Incident) error {
in.Active = false
return nil
})
}

// GetAll returns a list of all active Incidents.
func (s *Store) GetAll() ([]Incident, error) {
var active []Incident
q := ds.NewQuery(ds.INCIDENT_AM).Filter("active=", true)
keys, err := s.ds.GetAll(context.Background(), q, &active)
for i, key := range keys {
if active[i].Key == "" {
active[i].Key = key.Encode()
}
}
return active, err
}

// GetRecentlyResolved returns the NUM_RECENTLY_RESOLVED most recently
// archived Incidents.
func (s *Store) GetRecentlyResolved() ([]Incident, error) {
var resolved []Incident
q := ds.NewQuery(ds.INCIDENT_AM).Filter("active=", false).Order("-last_seen").Limit(NUM_RECENTLY_RESOLVED)
keys, err := s.ds.GetAll(context.Background(), q, &resolved)
for i, key := range keys {
if resolved[i].Key == "" {
resolved[i].Key = key.Encode()
}
}
return resolved, err
}

// GetRecentlyResolvedForID returns up to NUM_RECENTLY_RESOLVED_FOR_ID of the
// most recently archived Incidents with the given ID, excluding the Incident
// whose key is excludeKey.
func (s *Store) GetRecentlyResolvedForID(id, excludeKey string) ([]Incident, error) {
ancestor := ds.NewKey(ds.INCIDENT_ACTIVE_PARENT_AM)
ancestor.Name = id
var resolved []Incident
q := ds.NewQuery(ds.INCIDENT_AM).Ancestor(ancestor).Filter("active=", false).Order("-last_seen").Limit(NUM_RECENTLY_RESOLVED_FOR_ID)
keys, err := s.ds.GetAll(context.Background(), q, &resolved)
toDelete := -1
for i, key := range keys {
if resolved[i].Key == "" {
resolved[i].Key = key.Encode()
}
if resolved[i].Key == excludeKey {
toDelete = i
}
}
if toDelete != -1 {
resolved = append(resolved[:toDelete], resolved[toDelete+1:]...)
}
return resolved, err
}

// GetRecentlyResolvedInRange returns the Incidents whose last_seen falls
// within the given range, which ends now.
//
// d - The range in human units, e.g. "1w".
func (s *Store) GetRecentlyResolvedInRange(d string) ([]Incident, error) {
duration, err := human.ParseDuration(d)
if err != nil {
return nil, fmt.Errorf("Invalid range: %s", err)
}
ts := time.Now().Add(-1 * duration).Unix()
var resolved []Incident
q := ds.NewQuery(ds.INCIDENT_AM).Filter("last_seen>", ts)
keys, err := s.ds.GetAll(context.Background(), q, &resolved)
for i, key := range keys {
if resolved[i].Key == "" {
resolved[i].Key = key.Encode()
}
}
return resolved, err
}

// GetRecentlyResolvedInRangeWithID returns the Incidents with the given ID
// whose last_seen falls within the given range, which ends now.
//
// d - The range in human units, e.g. "1w".
// id - The ID of the Incidents to return.
func (s *Store) GetRecentlyResolvedInRangeWithID(d, id string) ([]Incident, error) {
duration, err := human.ParseDuration(d)
if err != nil {
return nil, fmt.Errorf("Invalid range: %s", err)
}
ts := time.Now().Add(-1 * duration).Unix()
ancestor := ds.NewKey(ds.INCIDENT_ACTIVE_PARENT_AM)
ancestor.Name = id
var resolved []Incident
q := ds.NewQuery(ds.INCIDENT_AM).Filter("last_seen>", ts).Ancestor(ancestor).Order("-last_seen")
keys, err := s.ds.GetAll(context.Background(), q, &resolved)
for i, key := range keys {
if resolved[i].Key == "" {
resolved[i].Key = key.Encode()
}
}
return resolved, err
}

// AreIncidentsFlaky is a utility function to help determine whether a slice
// of incidents is flaky. Flaky here means alerts that occasionally show up
// and go away on their own with no action taken to resolve them. They are
// also typically short-lived.
//
// numThreshold is the number of incidents required to have a sufficient
// sample size. If len(incidents) < numThreshold then the incidents are
// determined to be not flaky.
//
// durationThreshold is the duration in seconds below which an incident is
// considered short-lived.
//
// durationPercentage is the fraction of incidents that must be short-lived
// for the incidents to be considered flaky, e.g. 0.50 for 50%, 1 for 100%.
//
// Summary: the incidents are flaky if both of the following hold:
// - The number of incidents is >= numThreshold, giving a sufficient sample
//   size.
// - At least durationPercentage of the incidents lasted less than
//   durationThreshold seconds.
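//
// For example, with numThreshold=10, durationThreshold=120, and
// durationPercentage=0.5, a slice of 10 incidents in which at least 5 lasted
// under 120 seconds is reported as flaky.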
func AreIncidentsFlaky(incidents []Incident, numThreshold int, durationThreshold int64, durationPercentage float32) bool {
if len(incidents) < numThreshold {
return false
}
durationLessThanThreshold := 0
for _, i := range incidents {
if i.LastSeen-i.Start < durationThreshold {
durationLessThanThreshold++
}
}
return float32(durationLessThanThreshold)/float32(len(incidents)) >= durationPercentage
}