// Package compare compares two samples of data and determines whether the two
// samples are statistically different, the same, or unknown.
//
// The method is roughly:
// - Calculate the p-value using the KS test and the MWU test
// - Take the minimum of the two p-values
// - If the p-value < low_threshold (0.01), return different
// - Else if the p-value > high_threshold, return same
// - Else return unknown
//
// See [thresholds] for more context on the thresholds
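//
// As a sketch, the verdict rule mirrors the logic in the unexported compare
// function below (threshold values come from the [thresholds] package):
//
//	pValue := min(pValueKS, pValueMWU)
//	switch {
//	case pValue <= lowThreshold:
//		verdict = Different
//	case pValue <= highThreshold:
//		verdict = Unknown
//	default:
//		verdict = Same
//	}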
//
// # Functional bisections vs performance bisections:
//
// Most bisections are performance, meaning they measure performance regressions
// of benchmarks and stories on a device. Functional bisections measure how often
// a test fails during experimentation and whether a CL can be attributed to an
// increase in the failure rate, i.e. how functional the test is at that build of Chrome.
//
// Functional bisection is not explicitly implemented, due to its low demand. However,
// the analysis used in functional bisection is built into performance bisection.
// Sometimes a test will fail and rather than throw out the data, performance
// bisection backdoors into a functional bisection by using the functional
// bisection's thresholds.
//
// Algorithmically, there is no difference between the two, only the mathematical
// parameters used to define differences and their data inputs. Functional bisection
// deals with data in 0s and 1s (fail or success) and performance bisection deals
// with rational nonnegative numbers.
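//
// For example (hypothetical samples):
//
//	functional:  []float64{0, 1, 0, 0, 1}    // 0 = success, 1 = failure
//	performance: []float64{12.3, 11.8, 12.9} // e.g. timings in ms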
package compare
import (
"math"
"sort"
"go.skia.org/infra/go/skerr"
"go.skia.org/infra/go/sklog"
"go.skia.org/infra/pinpoint/go/compare/stats"
"go.skia.org/infra/pinpoint/go/compare/thresholds"
)
// ImprovementDir is the improvement direction of the metric being
// measured. The directions are either up, down, or unknown.
type ImprovementDir string
// These ImprovementDirs are the possible improvement directions.
const (
// UnknownDir means the job request did not send an improvement
// direction. Rather than infer it, we assume the direction
// is unknown and drill deeper on all statistically significant
// changes.
UnknownDir ImprovementDir = "UnknownDir"
// Up means the improvement direction is increasing.
Up ImprovementDir = "Up"
// Down means the improvement direction is decreasing.
Down ImprovementDir = "Down"
)
// Verdict is the outcome of the statistical analysis comparing two samples.
type Verdict string
// These verdicts are the possible results of the statistical analysis.
const (
// Unknown means that there is not enough evidence to reject
// either hypothesis. Collect more data before making a final decision.
Unknown Verdict = "Unknown"
// Same means that the samples likely come from the same distribution.
// Cannot reject the null hypothesis.
Same Verdict = "Same"
// Different means that the samples are unlikely to come
// from the same distribution. Reject the null hypothesis.
Different Verdict = "Different"
// NilVerdict means there was no analysis to be done.
// This can happen in performance comparisons when all
// benchmark runs fail and there is no data to analyze.
NilVerdict Verdict = "Nil"
// ErrorVerdict means something went wrong with the analysis.
// Returning this verdict is better than returning a nil struct.
ErrorVerdict Verdict = "Error"
)
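// float64EqualityThreshold is the tolerance used by almostEqual when
// comparing float64 values.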
const float64EqualityThreshold = 1e-9
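// almostEqual reports whether a and b differ by no more than
// float64EqualityThreshold.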
func almostEqual(a, b float64) bool {
return math.Abs(a-b) <= float64EqualityThreshold
}
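// mean returns the arithmetic mean of arr, or 0 if arr is empty.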
func mean(arr []float64) float64 {
if len(arr) == 0 {
return 0
}
var sum float64
for _, v := range arr {
sum += v
}
return sum / float64(len(arr))
}
// ComparePairwiseResult contains the results of a pairwise comparison
// between two samples
type ComparePairwiseResult struct {
// Verdict is the outcome of the statistical analysis which is Same or Different.
// Note that pairwise does not have an Unknown verdict.
Verdict Verdict
// stats.PairwiseWilcoxonSignedRankedTestResult is the result of the Pairwise
// statistical analysis.
stats.PairwiseWilcoxonSignedRankedTestResult
}
// ComparePairwise wraps PairwiseWilcoxonSignedRankedTest.
// TODO(sunxiaodi@): Add additional data manipulation CABE does to ensure
// data is valid prior to analysis.
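//
// A minimal usage sketch (hypothetical sample values; Down means lower is better):
//
//	a := []float64{10.1, 9.9, 10.0, 10.2}
//	b := []float64{11.0, 10.8, 11.1, 10.9}
//	res, err := ComparePairwise(a, b, Down)
//
// On a statistically significant regression, res.Verdict is Different and the
// confidence interval of the B-A difference excludes 0.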
func ComparePairwise(valuesA, valuesB []float64, dir ImprovementDir) (*ComparePairwiseResult, error) {
// The pairwise test is intentionally designed to receive the input backwards.
result, err := stats.PairwiseWilcoxonSignedRankedTest(valuesB, valuesA, stats.TwoSided, stats.LogTransform)
if err != nil {
return nil, skerr.Wrap(err)
}
// return same if improvement
if (dir == Up && result.LowerCi > 0) || (dir == Down && result.UpperCi < 0) {
return &ComparePairwiseResult{
Verdict: Same,
PairwiseWilcoxonSignedRankedTestResult: result,
}, nil
}
// at p-value < 0.05, it is unlikely for the 95% confidence interval to encompass 0,
// but we enforce a check just in case.
if result.PValue < thresholds.LowThreshold && result.LowerCi*result.UpperCi > 0 {
return &ComparePairwiseResult{
Verdict: Different,
PairwiseWilcoxonSignedRankedTestResult: result,
}, nil
}
return &ComparePairwiseResult{
Verdict: Same,
PairwiseWilcoxonSignedRankedTestResult: result,
}, nil
}
// Based on https://source.chromium.org/chromium/chromium/src/+/main:third_party/catapult/dashboard/dashboard/pinpoint/models/job_state.py;drc=94f2bff5159bf660910b35c39426102c5982c4a4;l=356
// the default expected error rate for functional analysis is 1.0 for all
// bisections that pivot to functional analysis.
const DefaultFunctionalErrRate = 1.0
// CompareResults contains the results of a comparison between two samples.
// TODO(b/299537769): update verdict to use protos
type CompareResults struct {
// Verdict is the outcome of the statistical analysis which is either
// Unknown, Same, or Different.
Verdict Verdict
// PValue is the consolidated p-value for the statistical tests used.
PValue float64
// PValueKS is the p-value estimate from the KS test
PValueKS float64
// PValueMWU is the p-value estimate from the MWU test
PValueMWU float64
// LowThreshold is the significance level `alpha`; a p-value at or below it
// rejects the null hypothesis.
LowThreshold float64
// HighThreshold is the level below which a p-value, while not significant,
// is suspicious enough that more data is needed for a definitive judgement.
HighThreshold float64
// MeanDiff is the difference between the mean of B and the mean of A.
// MeanDiff > 0 means the mean of B > mean of A.
// MeanDiff is used to decide if a difference is a regression or not.
MeanDiff float64
}
// CompareFunctional determines if valuesA and valuesB are statistically different,
// statistically same or unknown from each other using the functional low and high thresholds.
// Functional analysis compares failure rates between A and B.
// The expectedErrRate expresses how much of the flakiness in a benchmark
// measurement the culprit CL is responsible for, i.e. expectedErrRate = 0.5
// means the culprit causes the benchmark to fail 50% more often.
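//
// A minimal usage sketch (hypothetical runs; 0 = success, 1 = failure):
//
//	a := []float64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
//	b := []float64{1, 0, 1, 1, 0, 1, 1, 0, 1, 1}
//	res, err := CompareFunctional(a, b, DefaultFunctionalErrRate)
//
// res.Verdict is Different only if B's increase in failure rate is
// statistically significant.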
func CompareFunctional(valuesA, valuesB []float64, expectedErrRate float64) (*CompareResults, error) {
// This is technically not possible. It would imply that there were no benchmark runs scheduled
// or all scheduled runs terminally errored out.
if len(valuesA) == 0 || len(valuesB) == 0 {
return &CompareResults{Verdict: ErrorVerdict}, skerr.Fmt("cannot do functional comparison without data. len A %d and len B %d", len(valuesA), len(valuesB))
}
// avgSampleSize is the average number of samples between A and B.
// The samples may be imbalanced depending on the success of individual runs.
avgSampleSize := (len(valuesA) + len(valuesB)) / 2
if expectedErrRate < 0.0 || expectedErrRate > 1.0 {
sklog.Warning("Magnitude used in functional analysis was outside of the range of 0 and 1. Switching to default magnitude of 1.0")
expectedErrRate = DefaultFunctionalErrRate
}
lowThreshold := thresholds.LowThreshold
highThreshold, err := thresholds.HighThresholdFunctional(expectedErrRate, avgSampleSize)
if err != nil {
return &CompareResults{Verdict: ErrorVerdict}, skerr.Wrapf(err, "Could not get functional high threshold")
}
// functional analysis always assumes the improvement direction is down
// i.e. we want future commits to be less flaky and less error prone.
return compare(valuesA, valuesB, lowThreshold, highThreshold, Down)
}
// ComparePerformance determines if valuesA and valuesB are statistically different,
// statistically same or unknown from each other based on the perceived
// rawMagnitude difference between valuesA and valuesB using the performance
// low and high thresholds.
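//
// A minimal usage sketch (hypothetical timings in ms; rawMagnitude is
// normalized by the pooled interquartile range before thresholding):
//
//	a := []float64{100.2, 101.5, 99.8, 100.9}
//	b := []float64{110.3, 111.0, 109.7, 110.8}
//	res, err := ComparePerformance(a, b, 1.0, Down)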
func ComparePerformance(valuesA, valuesB []float64, rawMagnitude float64, direction ImprovementDir) (*CompareResults, error) {
// This situation happens if all benchmark runs fail.
if len(valuesA) == 0 || len(valuesB) == 0 {
return &CompareResults{Verdict: NilVerdict}, nil
}
// Copy into a fresh slice so sorting does not disturb the callers' slices.
allValues := make([]float64, 0, len(valuesA)+len(valuesB))
allValues = append(allValues, valuesA...)
allValues = append(allValues, valuesB...)
sort.Float64s(allValues)
iqr := allValues[len(allValues)*3/4] - allValues[len(allValues)/4]
normalizedMagnitude := float64(1.0)
if !almostEqual(rawMagnitude, 0.0) {
normalizedMagnitude = math.Abs(rawMagnitude / iqr)
}
// avgSampleSize is the average number of samples between A and B.
// The samples may be imbalanced depending on the success of individual runs.
avgSampleSize := len(allValues) / 2
lowThreshold := thresholds.LowThreshold
highThreshold, err := thresholds.HighThresholdPerformance(normalizedMagnitude, avgSampleSize)
if err != nil {
return &CompareResults{Verdict: ErrorVerdict}, skerr.Wrapf(err, "Could not get high threshold for bisection")
}
return compare(valuesA, valuesB, lowThreshold, highThreshold, direction)
}
// compare decides whether two samples are the same, different, or unknown
// using the KS and MWU tests, comparing their p-values against the
// low and high thresholds.
func compare(valuesA, valuesB []float64, lowThreshold, highThreshold float64, dir ImprovementDir) (*CompareResults, error) {
// If the change moves in the improvement direction, it is not a regression; return Same.
meanDiff := mean(valuesB) - mean(valuesA)
if (dir == Up && meanDiff > 0) || (dir == Down && meanDiff < 0) {
return &CompareResults{
Verdict: Same,
MeanDiff: meanDiff,
}, nil
}
// MWU is bad at detecting changes in variance, and K-S is bad with discrete
// distributions. So use both. We want low p-values for the below examples.
// a b MWU(a, b) KS(a, b)
// [0]*20 [0]*15+[1]*5 0.0097 0.4973
// range(10, 30) range(10)+range(30, 40) 0.4946 0.0082
pValueKS, err := KolmogorovSmirnov(valuesA, valuesB)
if err != nil {
return &CompareResults{Verdict: ErrorVerdict}, skerr.Wrapf(err, "Failed KS test")
}
// MannWhitneyU returns only a p-value; there is no error to check.
pValueMWU := MannWhitneyU(valuesA, valuesB)
pValue := min(pValueKS, pValueMWU)
var verdict Verdict
if pValue <= lowThreshold {
// The p-value is less than the significance level. Reject the null
// hypothesis.
verdict = Different
} else if pValue <= highThreshold {
// The p-value is not less than the significance level, but it's small
// enough to be suspicious. We'd like to investigate more closely.
verdict = Unknown
} else {
// The p-value is quite large. We're not suspicious that the two samples might
// come from different distributions, and we don't care to investigate more.
verdict = Same
}
return &CompareResults{
Verdict: verdict,
PValue: pValue,
PValueKS: pValueKS,
PValueMWU: pValueMWU,
LowThreshold: lowThreshold,
HighThreshold: highThreshold,
MeanDiff: meanDiff,
}, nil
}