package clustering

import (
	"fmt"
	"math"
	"math/rand"
	"sort"

	"github.com/golang/glog"

	"skia.googlesource.com/buildbot.git/perf/go/config"
	"skia.googlesource.com/buildbot.git/perf/go/ctrace"
	"skia.googlesource.com/buildbot.git/perf/go/kmeans"
	"skia.googlesource.com/buildbot.git/perf/go/types"
)

const (
	// K is the k in k-means.
	K = 50

	// MAX_KMEANS_ITERATIONS is the maximum number of k-means iterations to run.
	MAX_KMEANS_ITERATIONS = 100

	// KMEAN_EPSILON is the smallest change in the k-means total error we will
	// accept per iteration. If the change in error falls below KMEAN_EPSILON
	// the iteration will terminate.
	KMEAN_EPSILON = 1.0

	// INTERESTING_THRESHHOLD is the threshold beyond which StepFit.Regression
	// values become interesting, i.e. they may indicate real regressions or
	// improvements.
	INTERESTING_THRESHHOLD = 150.0
)

// ClusterSummaries is one summary for each cluster that the k-means clustering
// found.
type ClusterSummaries struct {
	Clusters         []*types.ClusterSummary
	StdDevThreshhold float64
	K                int
}

func NewClusterSummaries() *ClusterSummaries {
	return &ClusterSummaries{
		Clusters:         []*types.ClusterSummary{},
		StdDevThreshhold: config.MIN_STDDEV,
		K:                K,
	}
}

// chooseK chooses a random sample of k observations. Used as the starting
// point for the k-means clustering.
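//
// Each centroid is a deep copy of the sampled observation's values, so that
// later centroid updates don't mutate the underlying observations.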
func chooseK(observations []kmeans.Clusterable, k int) []kmeans.Clusterable {
	popN := len(observations)
	centroids := make([]kmeans.Clusterable, k)
	for i := 0; i < k; i++ {
		o := observations[rand.Intn(popN)].(*ctrace.ClusterableTrace)
		cp := &ctrace.ClusterableTrace{
			Key:    "I'm a centroid",
			Values: make([]float64, len(o.Values)),
		}
		copy(cp.Values, o.Values)
		centroids[i] = cp
	}
	return centroids
}

// traceToFlot converts the data into a format acceptable to the Flot plotting
// library.
//
// Flot expects data formatted as an array of [x, y] pairs.
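//
// For example, a trace with Values {1.0, 2.0, 3.0} becomes
// [[0, 1], [1, 2], [2, 3]].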
func traceToFlot(t *ctrace.ClusterableTrace) [][]float64 {
	ret := make([][]float64, len(t.Values))
	for i, x := range t.Values {
		ret[i] = []float64{float64(i), x}
	}
	return ret
}

// ValueWeightSortable is a utility type for sorting types.ValueWeight's by Weight.
type ValueWeightSortable []types.ValueWeight

func (p ValueWeightSortable) Len() int           { return len(p) }
func (p ValueWeightSortable) Less(i, j int) bool { return p[i].Weight > p[j].Weight } // Descending.
func (p ValueWeightSortable) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }

// getParamSummaries summarizes all the parameters for all observations in a cluster.
//
// The return value is a slice of []types.ValueWeight's, one []types.ValueWeight per
// parameter. The members of each []types.ValueWeight are sorted by Weight, with
// higher Weights first.
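//
// For example, if three of four members in a cluster have config=8888 and one
// has config=gpu, the entry for "config" will be
// [{Value: "8888", Weight: 22}, {Value: "gpu", Weight: 15}].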
func getParamSummaries(cluster []kmeans.Clusterable) [][]types.ValueWeight {
	// For each cluster member, increment each parameter value's count.
	type ValueMap map[string]int
	counts := map[string]ValueMap{}
	clusterSize := float64(len(cluster))
	// First figure out what parameters and values appear in the cluster.
	for _, o := range cluster {
		for k, v := range o.(*ctrace.ClusterableTrace).Params {
			if v == "" {
				continue
			}
			if _, ok := counts[k]; !ok {
				counts[k] = ValueMap{}
			}
			counts[k][v] += 1
		}
	}

	// Now calculate the weights for each parameter value. The weight of each
	// value is proportional to the number of times it appears on an observation
	// versus all other values for the same parameter.
	ret := make([][]types.ValueWeight, 0)
	for _, count := range counts {
		weights := []types.ValueWeight{}
		for value, weight := range count {
			// Scale the fraction of cluster members with this value into the
			// range [12, 26]: a value present on every member gets 26, and
			// rare values approach 12.
			weights = append(weights, types.ValueWeight{
				Value:  value,
				Weight: int(14*float64(weight)/clusterSize) + 12,
			})
		}
		sort.Sort(ValueWeightSortable(weights))
		ret = append(ret, weights)
	}
	return ret
}

// average calculates and returns the average value of the given []float64.
func average(xs []float64) float64 {
	total := 0.0
	for _, v := range xs {
		total += v
	}
	return total / float64(len(xs))
}

// sse calculates and returns the sum squared error from the given base of []float64.
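//
// For example, sse([]float64{1, 2, 3}, 2) returns 2.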
func sse(xs []float64, base float64) float64 {
	total := 0.0
	for _, v := range xs {
		total += math.Pow(v-base, 2)
	}
	return total
}

// getStepFit takes one []float64 trace and calculates and returns a types.StepFit.
//
// See types.StepFit for a description of the values being calculated.
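//
// For example, a trace that steps from 1.0 down to 0.0 halfway through will
// have a StepSize of 1.0, a small LeastSquares error, and a large positive
// Regression, so its Status will be "High".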
func getStepFit(trace []float64) *types.StepFit {
	lse := math.MaxFloat64
	stepSize := -1.0
	turn := 0
	// Try each index as a candidate step point and keep the one with the
	// lowest least-squares error for a step function fit to the trace.
	for i := range trace {
		if i == 0 {
			continue
		}
		y0 := average(trace[:i])
		y1 := average(trace[i:])
		if y0 == y1 {
			continue
		}
		d := math.Sqrt(sse(trace[:i], y0)+sse(trace[i:], y1)) / float64(len(trace))
		if d < lse {
			lse = d
			stepSize = y0 - y1
			turn = i
		}
	}
	regression := stepSize / lse
	status := "Uninteresting"
	if regression > INTERESTING_THRESHHOLD {
		status = "High"
	} else if regression < -INTERESTING_THRESHHOLD {
		status = "Low"
	}
	return &types.StepFit{
		LeastSquares: lse,
		StepSize:     stepSize,
		TurningPoint: turn,
		Regression:   regression,
		Status:       status,
	}
}

type SortableClusterable struct {
	Cluster  kmeans.Clusterable
	Distance float64
}

type SortableClusterableSlice []*SortableClusterable

func (p SortableClusterableSlice) Len() int           { return len(p) }
func (p SortableClusterableSlice) Less(i, j int) bool { return p[i].Distance < p[j].Distance }
func (p SortableClusterableSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }

type SortableClusterSummarySlice []*types.ClusterSummary

func (p SortableClusterSummarySlice) Len() int { return len(p) }
func (p SortableClusterSummarySlice) Less(i, j int) bool {
	return p[i].StepFit.Regression < p[j].StepFit.Regression
}
func (p SortableClusterSummarySlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }

// GetClusterSummaries returns a summary for each cluster.
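//
// The returned clusters are sorted by StepFit.Regression, from most negative
// to most positive.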
func GetClusterSummaries(observations, centroids []kmeans.Clusterable, commits []*types.Commit) *ClusterSummaries {
	ret := &ClusterSummaries{
		Clusters: make([]*types.ClusterSummary, len(centroids)),
	}
	allClusters, _ := kmeans.GetClusters(observations, centroids)
	for i, cluster := range allClusters {
		// cluster is a slice of all the observations for a given cluster, with
		// the centroid as the 0th element.
		numSampleTraces := len(cluster)
		if numSampleTraces > config.MAX_SAMPLE_TRACES_PER_CLUSTER {
			numSampleTraces = config.MAX_SAMPLE_TRACES_PER_CLUSTER
		}
		stepFit := getStepFit(cluster[0].(*ctrace.ClusterableTrace).Values)
		summary := types.NewClusterSummary(len(cluster)-1, numSampleTraces)
		summary.ParamSummaries = getParamSummaries(cluster)
		summary.StepFit = stepFit
		summary.Hash = commits[stepFit.TurningPoint].Hash
		summary.Timestamp = commits[stepFit.TurningPoint].CommitTime
		for j, o := range cluster {
			if j != 0 {
				summary.Keys[j-1] = o.(*ctrace.ClusterableTrace).Key
			}
		}

		// Sort the traces so they are ordered with the traces closest to the
		// centroid first.
		sc := []*SortableClusterable{}
		for j := 0; j < len(cluster); j++ {
			sc = append(sc, &SortableClusterable{Cluster: cluster[j], Distance: cluster[j].Distance(cluster[0])})
		}
		// Sort, but leave the centroid, the 0th element, unmoved.
		sort.Sort(SortableClusterableSlice(sc[1:]))

		for j, clusterable := range sc {
			if j == numSampleTraces {
				break
			}
			summary.Traces[j] = traceToFlot(clusterable.Cluster.(*ctrace.ClusterableTrace))
		}
		ret.Clusters[i] = summary
	}
	sort.Sort(SortableClusterSummarySlice(ret.Clusters))

	return ret
}

// Filter returns true if a trace should be included in clustering.
type Filter func(key string, tr *types.PerfTrace) bool

// CalculateClusterSummaries runs k-means clustering over the trace shapes.
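//
// Clustering stops once the total error changes by less than KMEAN_EPSILON
// between iterations, or after MAX_KMEANS_ITERATIONS iterations, whichever
// comes first.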
func CalculateClusterSummaries(tile *types.Tile, k int, stddevThreshhold float64, filter Filter) (*ClusterSummaries, error) {
	lastCommitIndex := tile.LastCommitIndex()
	observations := make([]kmeans.Clusterable, 0, len(tile.Traces))
	for key, trace := range tile.Traces {
		if filter(key, trace.(*types.PerfTrace)) {
			observations = append(observations, ctrace.NewFullTrace(string(key), trace.(*types.PerfTrace).Values[:lastCommitIndex+1], trace.Params(), stddevThreshhold))
		}
	}
	if len(observations) == 0 {
		return nil, fmt.Errorf("zero traces matched")
	}

	// Create K starting centroids.
	centroids := chooseK(observations, k)

	// TODO(jcgregorio) Keep iterating until the total error stops changing.
	lastTotalError := 0.0
	for i := 0; i < MAX_KMEANS_ITERATIONS; i++ {
		centroids = kmeans.Do(observations, centroids, ctrace.CalculateCentroid)
		totalError := kmeans.TotalError(observations, centroids)
		glog.Infof("Total Error: %f\n", totalError)
		// Stop early once the error converges to within KMEAN_EPSILON.
		if math.Abs(totalError-lastTotalError) < KMEAN_EPSILON {
			break
		}
		lastTotalError = totalError
	}
	clusterSummaries := GetClusterSummaries(observations, centroids, tile.Commits)
	clusterSummaries.K = k
	clusterSummaries.StdDevThreshhold = stddevThreshhold

	return clusterSummaries, nil
}