blob: a8084f1fbe8baf78d1720383e68c0cf886986651 [file] [log] [blame]
package clustering2
import (
"fmt"
"math"
"math/rand"
"sort"
"time"
"go.skia.org/infra/go/query"
"go.skia.org/infra/go/sklog"
"go.skia.org/infra/perf/go/config"
"go.skia.org/infra/perf/go/ctrace2"
"go.skia.org/infra/perf/go/dataframe"
"go.skia.org/infra/perf/go/kmeans"
"go.skia.org/infra/perf/go/stepfit"
"go.skia.org/infra/perf/go/types"
)
// timeNow allows testing calls to time.Now().
var timeNow = time.Now
const (
// K is the k in k-means.
K = 50
// MAX_KMEANS_ITERATIONS is the maximum number of k-means iterations to run.
MAX_KMEANS_ITERATIONS = 100
// KMEAN_EPSILON is the smallest change in the k-means total error we will
// accept per iteration. If the change in error falls below KMEAN_EPSILON
// the iteration will terminate.
KMEAN_EPSILON = 1.0
)
// ClusterSummary is a summary of a single cluster of traces.
type ClusterSummary struct {
// Centroid is the calculated centroid of the cluster.
Centroid []float32 `json:"centroid"`
// Keys of all the members of the Cluster.
//
// The keys are sorted so that the ones at the beginning of the list are
// closest to the centroid.
//
// Note: This value is not serialized to JSON.
Keys []string `json:"-"`
// Shortcut is the id of a shortcut for the above Keys.
Shortcut string `json:"shortcut"`
// ParamSummaries is a summary of all the parameters in the cluster.
ParamSummaries []ValuePercent `json:"param_summaries2"`
// StepFit is info on the fit of the centroid to a step function.
StepFit *stepfit.StepFit `json:"step_fit"`
// StepPoint is the ColumnHeader for the step point.
StepPoint *dataframe.ColumnHeader `json:"step_point"`
// Num is the number of observations that are in this cluster.
Num int `json:"num"`
// Timestamp is the timestamp when this regression was found.
Timestamp time.Time `json:"ts"`
}
// NewClusterSummary returns a new ClusterSummary.
func NewClusterSummary() *ClusterSummary {
return &ClusterSummary{
Keys: []string{},
ParamSummaries: []ValuePercent{},
StepFit: &stepfit.StepFit{},
StepPoint: &dataframe.ColumnHeader{},
Timestamp: timeNow(),
}
}
// ClusterSummaries is one summary for each cluster that the k-means clustering
// found.
type ClusterSummaries struct {
Clusters []*ClusterSummary
StdDevThreshold float32
K int
}
// chooseK chooses a random sample of k observations. Used as the starting
// point for the k-means clustering.
func chooseK(observations []kmeans.Clusterable, k int) []kmeans.Centroid {
popN := len(observations)
centroids := make([]kmeans.Centroid, k)
for i := 0; i < k; i++ {
centroids[i] = observations[rand.Intn(popN)].(*ctrace2.ClusterableTrace).Dup(ctrace2.CENTROID_KEY)
}
return centroids
}
// getParamSummaries summarizes all the parameters for all observations in a
// cluster.
//
// The return value is an array of []ValueWeight's, one []ValueWeight per
// parameter. The members of each []ValueWeight are sorted by the Weight, with
// higher Weight's first.
func getParamSummaries(cluster []kmeans.Clusterable) []ValuePercent {
keys := make([]string, 0, len(cluster))
for _, o := range cluster {
key := o.(*ctrace2.ClusterableTrace).Key
if key == ctrace2.CENTROID_KEY {
continue
}
keys = append(keys, key)
}
return GetParamSummariesForKeys(keys)
}
// GetParamSummariesForKeys summarizes all the parameters for all observations in a
// cluster.
//
// The return value is an array of []ValueWeight's, one []ValueWeight per
// parameter. The members of each []ValueWeight are sorted by the Weight, with
// higher Weight's first.
func GetParamSummariesForKeys(keys []string) []ValuePercent {
// For each cluster member increment each parameters count.
// map[key] map[value] count
counts := map[string]int{}
clusterSize := len(keys)
// First figure out what parameters and values appear in the cluster.
for _, key := range keys {
params, err := query.ParseKey(key)
if err != nil {
sklog.Errorf("Invalid key found in Cluster: %s", err)
continue
}
for k, v := range params {
if v == "" {
continue
}
counts[k+"="+v] += 1
}
}
ret := []ValuePercent{}
for key, count := range counts {
ret = append(ret, ValuePercent{
Value: key,
Percent: (100 * count) / clusterSize,
})
}
SortValuePercentSlice(ret)
return ret
}
// sortableClusterable allows for sorting kmeans.Clusterables.
type sortableClusterable struct {
Observation kmeans.Clusterable
Distance float64
}
type sortableClusterableSlice []*sortableClusterable
func (p sortableClusterableSlice) Len() int { return len(p) }
func (p sortableClusterableSlice) Less(i, j int) bool { return p[i].Distance < p[j].Distance }
func (p sortableClusterableSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
type sortableClusterSummarySlice []*ClusterSummary
func (p sortableClusterSummarySlice) Len() int { return len(p) }
func (p sortableClusterSummarySlice) Less(i, j int) bool {
return p[i].StepFit.Regression < p[j].StepFit.Regression
}
func (p sortableClusterSummarySlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
// getClusterSummaries returns a summary for each cluster.
func getClusterSummaries(observations []kmeans.Clusterable, centroids []kmeans.Centroid, header []*dataframe.ColumnHeader, interesting float32, stepDetection types.StepDetection, stddevThreshhold float32) *ClusterSummaries {
ret := &ClusterSummaries{
Clusters: make([]*ClusterSummary, len(centroids)),
}
allClusters, _ := kmeans.GetClusters(observations, centroids)
for i, cluster := range allClusters {
// cluster is just an array of the observations for a given cluster.
// Drop the first value which is the centroid.
cluster = cluster[1:]
numSampleKeys := len(cluster)
if numSampleKeys > config.MaxSampleTracesPerCluster {
numSampleKeys = config.MaxSampleTracesPerCluster
}
stepFit := stepfit.GetStepFitAtMid(centroids[i].(*ctrace2.ClusterableTrace).Values, stddevThreshhold, interesting, stepDetection)
summary := NewClusterSummary()
summary.ParamSummaries = getParamSummaries(cluster)
summary.StepFit = stepFit
summary.StepPoint = header[stepFit.TurningPoint]
summary.Num = len(cluster)
// First, sort the traces so they are ordered with the traces closest to
// the centroid first.
sc := []*sortableClusterable{}
for j := 0; j < len(cluster); j++ {
sc = append(sc, &sortableClusterable{
Observation: cluster[j],
Distance: centroids[i].Distance(cluster[j]),
})
}
sort.Sort(sortableClusterableSlice(sc))
for _, o := range sc[:numSampleKeys] {
summary.Keys = append(summary.Keys, o.Observation.(*ctrace2.ClusterableTrace).Key)
}
summary.Centroid = centroids[i].(*ctrace2.ClusterableTrace).Values
ret.Clusters[i] = summary
}
sort.Sort(sortableClusterSummarySlice(ret.Clusters))
return ret
}
// Progress is a function that is called periodically with the progress being
// made in clustering.
type Progress func(totalError float64)
// CalculateClusterSummaries runs k-means clustering over the trace shapes.
func CalculateClusterSummaries(df *dataframe.DataFrame, k int, stddevThreshold float32, progress Progress, interesting float32, stepDetection types.StepDetection) (*ClusterSummaries, error) {
// Convert the DataFrame to a slice of kmeans.Clusterable.
observations := make([]kmeans.Clusterable, 0, len(df.TraceSet))
for key, trace := range df.TraceSet {
observations = append(observations, ctrace2.NewFullTrace(key, trace, stddevThreshold))
}
if len(observations) == 0 {
return nil, fmt.Errorf("Zero traces in the DataFrame.")
}
// Create K starting centroids.
centroids := chooseK(observations, k)
lastTotalError := 0.0
for i := 0; i < MAX_KMEANS_ITERATIONS; i++ {
centroids = kmeans.Do(observations, centroids, ctrace2.CalculateCentroid)
totalError := kmeans.TotalError(observations, centroids)
if progress != nil {
progress(totalError)
}
if math.Abs(totalError-lastTotalError) < KMEAN_EPSILON {
break
}
lastTotalError = totalError
}
clusterSummaries := getClusterSummaries(observations, centroids, df.Header, interesting, stepDetection, stddevThreshold)
clusterSummaries.K = k
clusterSummaries.StdDevThreshold = stddevThreshold
return clusterSummaries, nil
}