Revert "Copy code from https://github.com/golang/perf/tree/master/internal/stats" This reverts commit 9a045f1d04a74d0723a642847bde4e02c02aa9f0. Reason for revert: Found the upstream code at github.com/aclements/go-moremath so there's no need to do this copy. Original change's description: > Copy code from https://github.com/golang/perf/tree/master/internal/stats > > Because we can't import an 'internal' directory. > > Since that code has been stable for close to four years, this seems relatively > safe. > > The code is copyright the Go authors using the Go License. See the added LICENSE > file. > > The only changes are: > 1. Add unittest.SmallTest() > 2. gofmt -s -w . > > Bug: skia:10884 > Change-Id: I8d1d922686762a9799cc7ed2d57867c498ce93ab > Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/331369 > Reviewed-by: Kevin Lubick <kjlubick@google.com> > Commit-Queue: Joe Gregorio <jcgregorio@google.com> TBR=jcgregorio@google.com,kjlubick@google.com Change-Id: I305c17737bf6696a3669fe3fd4e7ce6b6b389cc2 No-Presubmit: true No-Tree-Checks: true No-Try: true Bug: skia:10884 Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/331556 Reviewed-by: Joe Gregorio <jcgregorio@google.com> Commit-Queue: Joe Gregorio <jcgregorio@google.com>
diff --git a/perf/LICENSE b/perf/LICENSE deleted file mode 100644 index ea5ea89..0000000 --- a/perf/LICENSE +++ /dev/null
@@ -1,27 +0,0 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file
diff --git a/perf/go/stats/README.md b/perf/go/stats/README.md deleted file mode 100644 index 6349ebd..0000000 --- a/perf/go/stats/README.md +++ /dev/null
@@ -1,11 +0,0 @@ -# stats - -This code is copied from -https://github.com/golang/perf/tree/master/internal/stats because we can't -import an 'internal' directory. - -Since that code has been stable for close to four years, this seems relatively -safe. - -The code is copyright the Go authors using the Go License. See the local LICENSE -file.
diff --git a/perf/go/stats/alg.go b/perf/go/stats/alg.go deleted file mode 100644 index f3774b5..0000000 --- a/perf/go/stats/alg.go +++ /dev/null
@@ -1,112 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -// Miscellaneous helper algorithms - -import ( - "fmt" -) - -func maxint(a, b int) int { - if a > b { - return a - } - return b -} - -func minint(a, b int) int { - if a < b { - return a - } - return b -} - -func sumint(xs []int) int { - sum := 0 - for _, x := range xs { - sum += x - } - return sum -} - -// bisect returns an x in [low, high] such that |f(x)| <= tolerance -// using the bisection method. -// -// f(low) and f(high) must have opposite signs. -// -// If f does not have a root in this interval (e.g., it is -// discontiguous), this returns the X of the apparent discontinuity -// and false. -func bisect(f func(float64) float64, low, high, tolerance float64) (float64, bool) { - flow, fhigh := f(low), f(high) - if -tolerance <= flow && flow <= tolerance { - return low, true - } - if -tolerance <= fhigh && fhigh <= tolerance { - return high, true - } - if mathSign(flow) == mathSign(fhigh) { - panic(fmt.Sprintf("root of f is not bracketed by [low, high]; f(%g)=%g f(%g)=%g", low, flow, high, fhigh)) - } - for { - mid := (high + low) / 2 - fmid := f(mid) - if -tolerance <= fmid && fmid <= tolerance { - return mid, true - } - if mid == high || mid == low { - return mid, false - } - if mathSign(fmid) == mathSign(flow) { - low = mid - flow = fmid - } else { - high = mid - fhigh = fmid - } - } -} - -// bisectBool implements the bisection method on a boolean function. -// It returns x1, x2 ∈ [low, high], x1 < x2 such that f(x1) != f(x2) -// and x2 - x1 <= xtol. -// -// If f(low) == f(high), it panics. -func bisectBool(f func(float64) bool, low, high, xtol float64) (x1, x2 float64) { - flow, fhigh := f(low), f(high) - if flow == fhigh { - panic(fmt.Sprintf("root of f is not bracketed by [low, high]; f(%g)=%v f(%g)=%v", low, flow, high, fhigh)) - } - for { - if high-low <= xtol { - return low, high - } - mid := (high + low) / 2 - if mid == high || mid == low { - return low, high - } - fmid := f(mid) - if fmid == flow { - low = mid - flow = fmid - } else { - high = mid - fhigh = fmid - } - } -} - -// series returns the sum of the series f(0), f(1), ... -// -// This implementation is fast, but subject to round-off error. -func series(f func(float64) float64) float64 { - y, yp := 0.0, 1.0 - for n := 0.0; y != yp; n++ { - yp = y - y += f(n) - } - return y -}
diff --git a/perf/go/stats/beta.go b/perf/go/stats/beta.go deleted file mode 100644 index 9cd1cd7..0000000 --- a/perf/go/stats/beta.go +++ /dev/null
@@ -1,93 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import "math" - -func lgamma(x float64) float64 { - y, _ := math.Lgamma(x) - return y -} - -// mathBeta returns the value of the complete beta function B(a, b). -func mathBeta(a, b float64) float64 { - // B(x,y) = Γ(x)Γ(y) / Γ(x+y) - return math.Exp(lgamma(a) + lgamma(b) - lgamma(a+b)) -} - -// mathBetaInc returns the value of the regularized incomplete beta -// function Iₓ(a, b). -// -// This is not to be confused with the "incomplete beta function", -// which can be computed as BetaInc(x, a, b)*Beta(a, b). -// -// If x < 0 or x > 1, returns NaN. -func mathBetaInc(x, a, b float64) float64 { - // Based on Numerical Recipes in C, section 6.4. This uses the - // continued fraction definition of I: - // - // (xᵃ*(1-x)ᵇ)/(a*B(a,b)) * (1/(1+(d₁/(1+(d₂/(1+...)))))) - // - // where B(a,b) is the beta function and - // - // d_{2m+1} = -(a+m)(a+b+m)x/((a+2m)(a+2m+1)) - // d_{2m} = m(b-m)x/((a+2m-1)(a+2m)) - if x < 0 || x > 1 { - return math.NaN() - } - bt := 0.0 - if 0 < x && x < 1 { - // Compute the coefficient before the continued - // fraction. - bt = math.Exp(lgamma(a+b) - lgamma(a) - lgamma(b) + - a*math.Log(x) + b*math.Log(1-x)) - } - if x < (a+1)/(a+b+2) { - // Compute continued fraction directly. - return bt * betacf(x, a, b) / a - } else { - // Compute continued fraction after symmetry transform. - return 1 - bt*betacf(1-x, b, a)/b - } -} - -// betacf is the continued fraction component of the regularized -// incomplete beta function Iₓ(a, b). -func betacf(x, a, b float64) float64 { - const maxIterations = 200 - const epsilon = 3e-14 - - raiseZero := func(z float64) float64 { - if math.Abs(z) < math.SmallestNonzeroFloat64 { - return math.SmallestNonzeroFloat64 - } - return z - } - - c := 1.0 - d := 1 / raiseZero(1-(a+b)*x/(a+1)) - h := d - for m := 1; m <= maxIterations; m++ { - mf := float64(m) - - // Even step of the recurrence. - numer := mf * (b - mf) * x / ((a + 2*mf - 1) * (a + 2*mf)) - d = 1 / raiseZero(1+numer*d) - c = raiseZero(1 + numer/c) - h *= d * c - - // Odd step of the recurrence. - numer = -(a + mf) * (a + b + mf) * x / ((a + 2*mf) * (a + 2*mf + 1)) - d = 1 / raiseZero(1+numer*d) - c = raiseZero(1 + numer/c) - hfac := d * c - h *= hfac - - if math.Abs(hfac-1) < epsilon { - return h - } - } - panic("betainc: a or b too big; failed to converge") -}
diff --git a/perf/go/stats/beta_test.go b/perf/go/stats/beta_test.go deleted file mode 100644 index 760380f..0000000 --- a/perf/go/stats/beta_test.go +++ /dev/null
@@ -1,31 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "testing" - - "go.skia.org/infra/go/testutils/unittest" -) - -func TestBetaInc(t *testing.T) { - unittest.SmallTest(t) - // Example values from MATLAB betainc documentation. - testFunc(t, "I_0.5(%v, 3)", - func(a float64) float64 { return mathBetaInc(0.5, a, 3) }, - map[float64]float64{ - 0: 1.00000000000000, - 1: 0.87500000000000, - 2: 0.68750000000000, - 3: 0.50000000000000, - 4: 0.34375000000000, - 5: 0.22656250000000, - 6: 0.14453125000000, - 7: 0.08984375000000, - 8: 0.05468750000000, - 9: 0.03271484375000, - 10: 0.01928710937500, - }) -}
diff --git a/perf/go/stats/deltadist.go b/perf/go/stats/deltadist.go deleted file mode 100644 index bb3ba3f..0000000 --- a/perf/go/stats/deltadist.go +++ /dev/null
@@ -1,57 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -// DeltaDist is the Dirac delta function, centered at T, with total -// area 1. -// -// The CDF of the Dirac delta function is the Heaviside step function, -// centered at T. Specifically, f(T) == 1. -type DeltaDist struct { - T float64 -} - -func (d DeltaDist) PDF(x float64) float64 { - if x == d.T { - return inf - } - return 0 -} - -func (d DeltaDist) pdfEach(xs []float64) []float64 { - res := make([]float64, len(xs)) - for i, x := range xs { - if x == d.T { - res[i] = inf - } - } - return res -} - -func (d DeltaDist) CDF(x float64) float64 { - if x >= d.T { - return 1 - } - return 0 -} - -func (d DeltaDist) cdfEach(xs []float64) []float64 { - res := make([]float64, len(xs)) - for i, x := range xs { - res[i] = d.CDF(x) - } - return res -} - -func (d DeltaDist) InvCDF(y float64) float64 { - if y < 0 || y > 1 { - return nan - } - return d.T -} - -func (d DeltaDist) Bounds() (float64, float64) { - return d.T - 1, d.T + 1 -}
diff --git a/perf/go/stats/dist.go b/perf/go/stats/dist.go deleted file mode 100644 index 048477d..0000000 --- a/perf/go/stats/dist.go +++ /dev/null
@@ -1,210 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import "math/rand" - -// A DistCommon is a statistical distribution. DistCommon is a base -// interface provided by both continuous and discrete distributions. -type DistCommon interface { - // CDF returns the cumulative probability Pr[X <= x]. - // - // For continuous distributions, the CDF is the integral of - // the PDF from -inf to x. - // - // For discrete distributions, the CDF is the sum of the PMF - // at all defined points from -inf to x, inclusive. Note that - // the CDF of a discrete distribution is defined for the whole - // real line (unlike the PMF) but has discontinuities where - // the PMF is non-zero. - // - // The CDF is a monotonically increasing function and has a - // domain of all real numbers. If the distribution has bounded - // support, it has a range of [0, 1]; otherwise it has a range - // of (0, 1). Finally, CDF(-inf)==0 and CDF(inf)==1. - CDF(x float64) float64 - - // Bounds returns reasonable bounds for this distribution's - // PDF/PMF and CDF. The total weight outside of these bounds - // should be approximately 0. - // - // For a discrete distribution, both bounds are integer - // multiples of Step(). - // - // If this distribution has finite support, it returns exact - // bounds l, h such that CDF(l')=0 for all l' < l and - // CDF(h')=1 for all h' >= h. - Bounds() (float64, float64) -} - -// A Dist is a continuous statistical distribution. -type Dist interface { - DistCommon - - // PDF returns the value of the probability density function - // of this distribution at x. - PDF(x float64) float64 -} - -// A DiscreteDist is a discrete statistical distribution. -// -// Most discrete distributions are defined only at integral values of -// the random variable. However, some are defined at other intervals, -// so this interface takes a float64 value for the random variable. -// The probability mass function rounds down to the nearest defined -// point. Note that float64 values can exactly represent integer -// values between ±2**53, so this generally shouldn't be an issue for -// integer-valued distributions (likewise, for half-integer-valued -// distributions, float64 can exactly represent all values between -// ±2**52). -type DiscreteDist interface { - DistCommon - - // PMF returns the value of the probability mass function - // Pr[X = x'], where x' is x rounded down to the nearest - // defined point on the distribution. - // - // Note for implementers: for integer-valued distributions, - // round x using int(math.Floor(x)). Do not use int(x), since - // that truncates toward zero (unless all x <= 0 are handled - // the same). - PMF(x float64) float64 - - // Step returns s, where the distribution is defined for sℕ. - Step() float64 -} - -// TODO: Add a Support method for finite support distributions? Or -// maybe just another return value from Bounds indicating that the -// bounds are exact? - -// TODO: Plot method to return a pre-configured Plot object with -// reasonable bounds and an integral function? Have to distinguish -// PDF/CDF/InvCDF. Three methods? Argument? -// -// Doesn't have to be a method of Dist. Could be just a function that -// takes a Dist and uses Bounds. - -// InvCDF returns the inverse CDF function of the given distribution -// (also known as the quantile function or the percent point -// function). This is a function f such that f(dist.CDF(x)) == x. If -// dist.CDF is only weakly monotonic (that it, there are intervals -// over which it is constant) and y > 0, f returns the smallest x that -// satisfies this condition. In general, the inverse CDF is not -// well-defined for y==0, but for convenience if y==0, f returns the -// largest x that satisfies this condition. For distributions with -// infinite support both the largest and smallest x are -Inf; however, -// for distributions with finite support, this is the lower bound of -// the support. -// -// If y < 0 or y > 1, f returns NaN. -// -// If dist implements InvCDF(float64) float64, this returns that -// method. Otherwise, it returns a function that uses a generic -// numerical method to construct the inverse CDF at y by finding x -// such that dist.CDF(x) == y. This may have poor precision around -// points of discontinuity, including f(0) and f(1). -func InvCDF(dist DistCommon) func(y float64) (x float64) { - type invCDF interface { - InvCDF(float64) float64 - } - if dist, ok := dist.(invCDF); ok { - return dist.InvCDF - } - - // Otherwise, use a numerical algorithm. - // - // TODO: For discrete distributions, use the step size to - // inform this computation. - return func(y float64) (x float64) { - const almostInf = 1e100 - const xtol = 1e-16 - - if y < 0 || y > 1 { - return nan - } else if y == 0 { - l, _ := dist.Bounds() - if dist.CDF(l) == 0 { - // Finite support - return l - } else { - // Infinite support - return -inf - } - } else if y == 1 { - _, h := dist.Bounds() - if dist.CDF(h) == 1 { - // Finite support - return h - } else { - // Infinite support - return inf - } - } - - // Find loX, hiX for which cdf(loX) < y <= cdf(hiX). - var loX, loY, hiX, hiY float64 - x1, y1 := 0.0, dist.CDF(0) - xdelta := 1.0 - if y1 < y { - hiX, hiY = x1, y1 - for hiY < y && hiX != inf { - loX, loY, hiX = hiX, hiY, hiX+xdelta - hiY = dist.CDF(hiX) - xdelta *= 2 - } - } else { - loX, loY = x1, y1 - for y <= loY && loX != -inf { - hiX, hiY, loX = loX, loY, loX-xdelta - loY = dist.CDF(loX) - xdelta *= 2 - } - } - if loX == -inf { - return loX - } else if hiX == inf { - return hiX - } - - // Use bisection on the interval to find the smallest - // x at which cdf(x) <= y. - _, x = bisectBool(func(x float64) bool { - return dist.CDF(x) < y - }, loX, hiX, xtol) - return - } -} - -// Rand returns a random number generator that draws from the given -// distribution. The returned generator takes an optional source of -// randomness; if this is nil, it uses the default global source. -// -// If dist implements Rand(*rand.Rand) float64, Rand returns that -// method. Otherwise, it returns a generic generator based on dist's -// inverse CDF (which may in turn use an efficient implementation or a -// generic numerical implementation; see InvCDF). -func Rand(dist DistCommon) func(*rand.Rand) float64 { - type distRand interface { - Rand(*rand.Rand) float64 - } - if dist, ok := dist.(distRand); ok { - return dist.Rand - } - - // Otherwise, use a generic algorithm. - inv := InvCDF(dist) - return func(r *rand.Rand) float64 { - var y float64 - for y == 0 { - if r == nil { - y = rand.Float64() - } else { - y = r.Float64() - } - } - return inv(y) - } -}
diff --git a/perf/go/stats/mathx.go b/perf/go/stats/mathx.go deleted file mode 100644 index 68a9a0a..0000000 --- a/perf/go/stats/mathx.go +++ /dev/null
@@ -1,75 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import "math" - -// mathSign returns the sign of x: -1 if x < 0, 0 if x == 0, 1 if x > 0. -// If x is NaN, it returns NaN. -func mathSign(x float64) float64 { - if x == 0 { - return 0 - } else if x < 0 { - return -1 - } else if x > 0 { - return 1 - } - return nan -} - -const smallFactLimit = 20 // 20! => 62 bits -var smallFact [smallFactLimit + 1]int64 - -func init() { - smallFact[0] = 1 - fact := int64(1) - for n := int64(1); n <= smallFactLimit; n++ { - fact *= n - smallFact[n] = fact - } -} - -// mathChoose returns the binomial coefficient of n and k. -func mathChoose(n, k int) float64 { - if k == 0 || k == n { - return 1 - } - if k < 0 || n < k { - return 0 - } - if n <= smallFactLimit { // Implies k <= smallFactLimit - // It's faster to do several integer multiplications - // than it is to do an extra integer division. - // Remarkably, this is also faster than pre-computing - // Pascal's triangle (presumably because this is very - // cache efficient). - numer := int64(1) - for n1 := int64(n - (k - 1)); n1 <= int64(n); n1++ { - numer *= n1 - } - denom := smallFact[k] - return float64(numer / denom) - } - - return math.Exp(lchoose(n, k)) -} - -// mathLchoose returns math.Log(mathChoose(n, k)). -func mathLchoose(n, k int) float64 { - if k == 0 || k == n { - return 0 - } - if k < 0 || n < k { - return math.NaN() - } - return lchoose(n, k) -} - -func lchoose(n, k int) float64 { - a, _ := math.Lgamma(float64(n + 1)) - b, _ := math.Lgamma(float64(k + 1)) - c, _ := math.Lgamma(float64(n - k + 1)) - return a - b - c -}
diff --git a/perf/go/stats/normaldist.go b/perf/go/stats/normaldist.go deleted file mode 100644 index d00f96a..0000000 --- a/perf/go/stats/normaldist.go +++ /dev/null
@@ -1,141 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "math" - "math/rand" -) - -// NormalDist is a normal (Gaussian) distribution with mean Mu and -// standard deviation Sigma. -type NormalDist struct { - Mu, Sigma float64 -} - -// StdNormal is the standard normal distribution (Mu = 0, Sigma = 1) -var StdNormal = NormalDist{0, 1} - -// 1/sqrt(2 * pi) -const invSqrt2Pi = 0.39894228040143267793994605993438186847585863116493465766592583 - -func (n NormalDist) PDF(x float64) float64 { - z := x - n.Mu - return math.Exp(-z*z/(2*n.Sigma*n.Sigma)) * invSqrt2Pi / n.Sigma -} - -func (n NormalDist) pdfEach(xs []float64) []float64 { - res := make([]float64, len(xs)) - if n.Mu == 0 && n.Sigma == 1 { - // Standard normal fast path - for i, x := range xs { - res[i] = math.Exp(-x*x/2) * invSqrt2Pi - } - } else { - a := -1 / (2 * n.Sigma * n.Sigma) - b := invSqrt2Pi / n.Sigma - for i, x := range xs { - z := x - n.Mu - res[i] = math.Exp(z*z*a) * b - } - } - return res -} - -func (n NormalDist) CDF(x float64) float64 { - return math.Erfc(-(x-n.Mu)/(n.Sigma*math.Sqrt2)) / 2 -} - -func (n NormalDist) cdfEach(xs []float64) []float64 { - res := make([]float64, len(xs)) - a := 1 / (n.Sigma * math.Sqrt2) - for i, x := range xs { - res[i] = math.Erfc(-(x-n.Mu)*a) / 2 - } - return res -} - -func (n NormalDist) InvCDF(p float64) (x float64) { - // This is based on Peter John Acklam's inverse normal CDF - // algorithm: http://home.online.no/~pjacklam/notes/invnorm/ - const ( - a1 = -3.969683028665376e+01 - a2 = 2.209460984245205e+02 - a3 = -2.759285104469687e+02 - a4 = 1.383577518672690e+02 - a5 = -3.066479806614716e+01 - a6 = 2.506628277459239e+00 - - b1 = -5.447609879822406e+01 - b2 = 1.615858368580409e+02 - b3 = -1.556989798598866e+02 - b4 = 6.680131188771972e+01 - b5 = -1.328068155288572e+01 - - c1 = -7.784894002430293e-03 - c2 = -3.223964580411365e-01 - c3 = -2.400758277161838e+00 - c4 = -2.549732539343734e+00 - c5 = 4.374664141464968e+00 - c6 = 2.938163982698783e+00 - - d1 = 7.784695709041462e-03 - d2 = 3.224671290700398e-01 - d3 = 2.445134137142996e+00 - d4 = 3.754408661907416e+00 - - plow = 0.02425 - phigh = 1 - plow - ) - - if p < 0 || p > 1 { - return nan - } else if p == 0 { - return -inf - } else if p == 1 { - return inf - } - - if p < plow { - // Rational approximation for lower region. - q := math.Sqrt(-2 * math.Log(p)) - x = (((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q + c6) / - ((((d1*q+d2)*q+d3)*q+d4)*q + 1) - } else if phigh < p { - // Rational approximation for upper region. - q := math.Sqrt(-2 * math.Log(1-p)) - x = -(((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q + c6) / - ((((d1*q+d2)*q+d3)*q+d4)*q + 1) - } else { - // Rational approximation for central region. - q := p - 0.5 - r := q * q - x = (((((a1*r+a2)*r+a3)*r+a4)*r+a5)*r + a6) * q / - (((((b1*r+b2)*r+b3)*r+b4)*r+b5)*r + 1) - } - - // Refine approximation. - e := 0.5*math.Erfc(-x/math.Sqrt2) - p - u := e * math.Sqrt(2*math.Pi) * math.Exp(x*x/2) - x = x - u/(1+x*u/2) - - // Adjust from standard normal. - return x*n.Sigma + n.Mu -} - -func (n NormalDist) Rand(r *rand.Rand) float64 { - var x float64 - if r == nil { - x = rand.NormFloat64() - } else { - x = r.NormFloat64() - } - return x*n.Sigma + n.Mu -} - -func (n NormalDist) Bounds() (float64, float64) { - const stddevs = 3 - return n.Mu - stddevs*n.Sigma, n.Mu + stddevs*n.Sigma -}
diff --git a/perf/go/stats/normaldist_test.go b/perf/go/stats/normaldist_test.go deleted file mode 100644 index 1e83c32..0000000 --- a/perf/go/stats/normaldist_test.go +++ /dev/null
@@ -1,36 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "fmt" - "math" - "testing" - - "go.skia.org/infra/go/testutils/unittest" -) - -func TestNormalDist(t *testing.T) { - unittest.SmallTest(t) - d := StdNormal - - testFunc(t, fmt.Sprintf("%+v.PDF", d), d.PDF, map[float64]float64{ - -10000: 0, // approx - -1: 1 / math.Sqrt(2*math.Pi) * math.Exp(-0.5), - 0: 1 / math.Sqrt(2*math.Pi), - 1: 1 / math.Sqrt(2*math.Pi) * math.Exp(-0.5), - 10000: 0, // approx - }) - - testFunc(t, fmt.Sprintf("%+v.CDF", d), d.CDF, map[float64]float64{ - -10000: 0, // approx - 0: 0.5, - 10000: 1, // approx - }) - - d2 := NormalDist{Mu: 2, Sigma: 5} - testInvCDF(t, d, false) - testInvCDF(t, d2, false) -}
diff --git a/perf/go/stats/package.go b/perf/go/stats/package.go deleted file mode 100644 index d0b291e..0000000 --- a/perf/go/stats/package.go +++ /dev/null
@@ -1,27 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package stats implements several statistical distributions, -// hypothesis tests, and functions for descriptive statistics. -// -// Currently stats is fairly small, but for what it does implement, it -// focuses on high quality, fast implementations with good, idiomatic -// Go APIs. -// -// This is a trimmed fork of github.com/aclements/go-moremath/stats. -package stats - -import ( - "errors" - "math" -) - -var inf = math.Inf(1) -var nan = math.NaN() - -// TODO: Put all errors in the same place and maybe unify them. - -var ( - ErrSamplesEqual = errors.New("all samples are equal") -)
diff --git a/perf/go/stats/sample.go b/perf/go/stats/sample.go deleted file mode 100644 index d6fe8a6..0000000 --- a/perf/go/stats/sample.go +++ /dev/null
@@ -1,340 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "math" - "sort" -) - -// Sample is a collection of possibly weighted data points. -type Sample struct { - // Xs is the slice of sample values. - Xs []float64 - - // Weights[i] is the weight of sample Xs[i]. If Weights is - // nil, all Xs have weight 1. Weights must have the same - // length of Xs and all values must be non-negative. - Weights []float64 - - // Sorted indicates that Xs is sorted in ascending order. - Sorted bool -} - -// Bounds returns the minimum and maximum values of xs. -func Bounds(xs []float64) (min float64, max float64) { - if len(xs) == 0 { - return math.NaN(), math.NaN() - } - min, max = xs[0], xs[0] - for _, x := range xs { - if x < min { - min = x - } - if x > max { - max = x - } - } - return -} - -// Bounds returns the minimum and maximum values of the Sample. -// -// If the Sample is weighted, this ignores samples with zero weight. -// -// This is constant time if s.Sorted and there are no zero-weighted -// values. -func (s Sample) Bounds() (min float64, max float64) { - if len(s.Xs) == 0 || (!s.Sorted && s.Weights == nil) { - return Bounds(s.Xs) - } - - if s.Sorted { - if s.Weights == nil { - return s.Xs[0], s.Xs[len(s.Xs)-1] - } - min, max = math.NaN(), math.NaN() - for i, w := range s.Weights { - if w != 0 { - min = s.Xs[i] - break - } - } - if math.IsNaN(min) { - return - } - for i := range s.Weights { - if s.Weights[len(s.Weights)-i-1] != 0 { - max = s.Xs[len(s.Weights)-i-1] - break - } - } - } else { - min, max = math.Inf(1), math.Inf(-1) - for i, x := range s.Xs { - w := s.Weights[i] - if x < min && w != 0 { - min = x - } - if x > max && w != 0 { - max = x - } - } - if math.IsInf(min, 0) { - min, max = math.NaN(), math.NaN() - } - } - return -} - -// vecSum returns the sum of xs. -func vecSum(xs []float64) float64 { - sum := 0.0 - for _, x := range xs { - sum += x - } - return sum -} - -// Sum returns the (possibly weighted) sum of the Sample. -func (s Sample) Sum() float64 { - if s.Weights == nil { - return vecSum(s.Xs) - } - sum := 0.0 - for i, x := range s.Xs { - sum += x * s.Weights[i] - } - return sum -} - -// Weight returns the total weight of the Sasmple. -func (s Sample) Weight() float64 { - if s.Weights == nil { - return float64(len(s.Xs)) - } - return vecSum(s.Weights) -} - -// Mean returns the arithmetic mean of xs. -func Mean(xs []float64) float64 { - if len(xs) == 0 { - return math.NaN() - } - m := 0.0 - for i, x := range xs { - m += (x - m) / float64(i+1) - } - return m -} - -// Mean returns the arithmetic mean of the Sample. -func (s Sample) Mean() float64 { - if len(s.Xs) == 0 || s.Weights == nil { - return Mean(s.Xs) - } - - m, wsum := 0.0, 0.0 - for i, x := range s.Xs { - // Use weighted incremental mean: - // m_i = (1 - w_i/wsum_i) * m_(i-1) + (w_i/wsum_i) * x_i - // = m_(i-1) + (x_i - m_(i-1)) * (w_i/wsum_i) - w := s.Weights[i] - wsum += w - m += (x - m) * w / wsum - } - return m -} - -// GeoMean returns the geometric mean of xs. xs must be positive. -func GeoMean(xs []float64) float64 { - if len(xs) == 0 { - return math.NaN() - } - m := 0.0 - for i, x := range xs { - if x <= 0 { - return math.NaN() - } - lx := math.Log(x) - m += (lx - m) / float64(i+1) - } - return math.Exp(m) -} - -// GeoMean returns the geometric mean of the Sample. All samples -// values must be positive. -func (s Sample) GeoMean() float64 { - if len(s.Xs) == 0 || s.Weights == nil { - return GeoMean(s.Xs) - } - - m, wsum := 0.0, 0.0 - for i, x := range s.Xs { - w := s.Weights[i] - wsum += w - lx := math.Log(x) - m += (lx - m) * w / wsum - } - return math.Exp(m) -} - -// Variance returns the sample variance of xs. -func Variance(xs []float64) float64 { - if len(xs) == 0 { - return math.NaN() - } else if len(xs) <= 1 { - return 0 - } - - // Based on Wikipedia's presentation of Welford 1962 - // (http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm). - // This is more numerically stable than the standard two-pass - // formula and not prone to massive cancellation. - mean, M2 := 0.0, 0.0 - for n, x := range xs { - delta := x - mean - mean += delta / float64(n+1) - M2 += delta * (x - mean) - } - return M2 / float64(len(xs)-1) -} - -func (s Sample) Variance() float64 { - if len(s.Xs) == 0 || s.Weights == nil { - return Variance(s.Xs) - } - // TODO(austin) - panic("Weighted Variance not implemented") -} - -// StdDev returns the sample standard deviation of xs. -func StdDev(xs []float64) float64 { - return math.Sqrt(Variance(xs)) -} - -// StdDev returns the sample standard deviation of the Sample. -func (s Sample) StdDev() float64 { - if len(s.Xs) == 0 || s.Weights == nil { - return StdDev(s.Xs) - } - // TODO(austin) - panic("Weighted StdDev not implemented") -} - -// Percentile returns the pctileth value from the Sample. This uses -// interpolation method R8 from Hyndman and Fan (1996). -// -// pctile will be capped to the range [0, 1]. If len(xs) == 0 or all -// weights are 0, returns NaN. -// -// Percentile(0.5) is the median. Percentile(0.25) and -// Percentile(0.75) are the first and third quartiles, respectively. -// -// This is constant time if s.Sorted and s.Weights == nil. -func (s Sample) Percentile(pctile float64) float64 { - if len(s.Xs) == 0 { - return math.NaN() - } else if pctile <= 0 { - min, _ := s.Bounds() - return min - } else if pctile >= 1 { - _, max := s.Bounds() - return max - } - - if !s.Sorted { - // TODO(austin) Use select algorithm instead - s = *s.Copy().Sort() - } - - if s.Weights == nil { - N := float64(len(s.Xs)) - //n := pctile * (N + 1) // R6 - n := 1/3.0 + pctile*(N+1/3.0) // R8 - kf, frac := math.Modf(n) - k := int(kf) - if k <= 0 { - return s.Xs[0] - } else if k >= len(s.Xs) { - return s.Xs[len(s.Xs)-1] - } - return s.Xs[k-1] + frac*(s.Xs[k]-s.Xs[k-1]) - } else { - // TODO(austin): Implement interpolation - - target := s.Weight() * pctile - - // TODO(austin) If we had cumulative weights, we could - // do this in log time. - for i, weight := range s.Weights { - target -= weight - if target < 0 { - return s.Xs[i] - } - } - return s.Xs[len(s.Xs)-1] - } -} - -// IQR returns the interquartile range of the Sample. -// -// This is constant time if s.Sorted and s.Weights == nil. -func (s Sample) IQR() float64 { - if !s.Sorted { - s = *s.Copy().Sort() - } - return s.Percentile(0.75) - s.Percentile(0.25) -} - -type sampleSorter struct { - xs []float64 - weights []float64 -} - -func (p *sampleSorter) Len() int { - return len(p.xs) -} - -func (p *sampleSorter) Less(i, j int) bool { - return p.xs[i] < p.xs[j] -} - -func (p *sampleSorter) Swap(i, j int) { - p.xs[i], p.xs[j] = p.xs[j], p.xs[i] - p.weights[i], p.weights[j] = p.weights[j], p.weights[i] -} - -// Sort sorts the samples in place in s and returns s. -// -// A sorted sample improves the performance of some algorithms. -func (s *Sample) Sort() *Sample { - if s.Sorted || sort.Float64sAreSorted(s.Xs) { - // All set - } else if s.Weights == nil { - sort.Float64s(s.Xs) - } else { - sort.Sort(&sampleSorter{s.Xs, s.Weights}) - } - s.Sorted = true - return s -} - -// Copy returns a copy of the Sample. -// -// The returned Sample shares no data with the original, so they can -// be modified (for example, sorted) independently. -func (s Sample) Copy() *Sample { - xs := make([]float64, len(s.Xs)) - copy(xs, s.Xs) - - weights := []float64(nil) - if s.Weights != nil { - weights = make([]float64, len(s.Weights)) - copy(weights, s.Weights) - } - - return &Sample{xs, weights, s.Sorted} -}
diff --git a/perf/go/stats/sample_test.go b/perf/go/stats/sample_test.go deleted file mode 100644 index d7d0118..0000000 --- a/perf/go/stats/sample_test.go +++ /dev/null
@@ -1,26 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "testing" - - "go.skia.org/infra/go/testutils/unittest" -) - -func TestSamplePercentile(t *testing.T) { - unittest.SmallTest(t) - s := Sample{Xs: []float64{15, 20, 35, 40, 50}} - testFunc(t, "Percentile", s.Percentile, map[float64]float64{ - -1: 15, - 0: 15, - .05: 15, - .30: 19.666666666666666, - .40: 27, - .95: 50, - 1: 50, - 2: 50, - }) -}
diff --git a/perf/go/stats/tdist.go b/perf/go/stats/tdist.go deleted file mode 100644 index 1d1c704..0000000 --- a/perf/go/stats/tdist.go +++ /dev/null
@@ -1,33 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import "math" - -// A TDist is a Student's t-distribution with V degrees of freedom. -type TDist struct { - V float64 -} - -func (t TDist) PDF(x float64) float64 { - return math.Exp(lgamma((t.V+1)/2)-lgamma(t.V/2)) / - math.Sqrt(t.V*math.Pi) * math.Pow(1+(x*x)/t.V, -(t.V+1)/2) -} - -func (t TDist) CDF(x float64) float64 { - if x == 0 { - return 0.5 - } else if x > 0 { - return 1 - 0.5*mathBetaInc(t.V/(t.V+x*x), t.V/2, 0.5) - } else if x < 0 { - return 1 - t.CDF(-x) - } else { - return math.NaN() - } -} - -func (t TDist) Bounds() (float64, float64) { - return -4, 4 -}
diff --git a/perf/go/stats/tdist_test.go b/perf/go/stats/tdist_test.go deleted file mode 100644 index 7915631..0000000 --- a/perf/go/stats/tdist_test.go +++ /dev/null
@@ -1,100 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "testing" - - "go.skia.org/infra/go/testutils/unittest" -) - -func TestT(t *testing.T) { - unittest.SmallTest(t) - testFunc(t, "PDF(%v|v=1)", TDist{1}.PDF, map[float64]float64{ - -10: 0.0031515830315226806, - -9: 0.0038818278802901312, - -8: 0.0048970751720583188, - -7: 0.0063661977236758151, - -6: 0.0086029698968592104, - -5: 0.012242687930145799, - -4: 0.018724110951987692, - -3: 0.031830988618379075, - -2: 0.063661977236758149, - -1: 0.15915494309189537, - 0: 0.31830988618379075, - 1: 0.15915494309189537, - 2: 0.063661977236758149, - 3: 0.031830988618379075, - 4: 0.018724110951987692, - 5: 0.012242687930145799, - 6: 0.0086029698968592104, - 7: 0.0063661977236758151, - 8: 0.0048970751720583188, - 9: 0.0038818278802901312}) - testFunc(t, "PDF(%v|v=5)", TDist{5}.PDF, map[float64]float64{ - -10: 4.0989816415343313e-05, - -9: 7.4601664362590413e-05, - -8: 0.00014444303269563934, - -7: 0.00030134402928803911, - -6: 0.00068848154013743002, - -5: 0.0017574383788078445, - -4: 0.0051237270519179133, - -3: 0.017292578800222964, - -2: 0.065090310326216455, - -1: 0.21967979735098059, - 0: 0.3796066898224944, - 1: 0.21967979735098059, - 2: 0.065090310326216455, - 3: 0.017292578800222964, - 4: 0.0051237270519179133, - 5: 0.0017574383788078445, - 6: 0.00068848154013743002, - 7: 0.00030134402928803911, - 8: 0.00014444303269563934, - 9: 7.4601664362590413e-05}) - - testFunc(t, "CDF(%v|v=1)", TDist{1}.CDF, map[float64]float64{ - -10: 0.03172551743055356, - -9: 0.035223287477277272, - -8: 0.039583424160565539, - -7: 0.045167235300866547, - -6: 0.052568456711253424, - -5: 0.06283295818900117, - -4: 0.077979130377369324, - -3: 0.10241638234956672, - -2: 0.14758361765043321, - -1: 0.24999999999999978, - 0: 0.5, - 1: 0.75000000000000022, - 2: 0.85241638234956674, - 3: 0.89758361765043326, - 4: 0.92202086962263075, - 5: 0.93716704181099886, - 6: 0.94743154328874657, - 7: 0.95483276469913347, - 8: 0.96041657583943452, - 9: 0.96477671252272279}) - testFunc(t, "CDF(%v|v=5)", TDist{5}.CDF, map[float64]float64{ - -10: 8.5473787871481787e-05, - -9: 0.00014133998712194845, - -8: 0.00024645333028622187, - -7: 0.00045837375719920225, - -6: 0.00092306914479700695, - -5: 0.0020523579900266612, - -4: 0.0051617077404157259, - -3: 0.015049623948731284, - -2: 0.05096973941492914, - -1: 0.18160873382456127, - 0: 0.5, - 1: 0.81839126617543867, - 2: 0.9490302605850709, - 3: 0.98495037605126878, - 4: 0.99483829225958431, - 5: 0.99794764200997332, - 6: 0.99907693085520299, - 7: 0.99954162624280074, - 8: 0.99975354666971372, - 9: 0.9998586600128780}) -}
diff --git a/perf/go/stats/ttest.go b/perf/go/stats/ttest.go deleted file mode 100644 index 8742298..0000000 --- a/perf/go/stats/ttest.go +++ /dev/null
@@ -1,147 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "errors" - "math" -) - -// A TTestResult is the result of a t-test. -type TTestResult struct { - // N1 and N2 are the sizes of the input samples. For a - // one-sample t-test, N2 is 0. - N1, N2 int - - // T is the value of the t-statistic for this t-test. - T float64 - - // DoF is the degrees of freedom for this t-test. - DoF float64 - - // AltHypothesis specifies the alternative hypothesis tested - // by this test against the null hypothesis that there is no - // difference in the means of the samples. - AltHypothesis LocationHypothesis - - // P is p-value for this t-test for the given null hypothesis. - P float64 -} - -func newTTestResult(n1, n2 int, t, dof float64, alt LocationHypothesis) *TTestResult { - dist := TDist{dof} - var p float64 - switch alt { - case LocationDiffers: - p = 2 * (1 - dist.CDF(math.Abs(t))) - case LocationLess: - p = dist.CDF(t) - case LocationGreater: - p = 1 - dist.CDF(t) - } - return &TTestResult{N1: n1, N2: n2, T: t, DoF: dof, AltHypothesis: alt, P: p} -} - -// A TTestSample is a sample that can be used for a one or two sample -// t-test. -type TTestSample interface { - Weight() float64 - Mean() float64 - Variance() float64 -} - -var ( - ErrSampleSize = errors.New("sample is too small") - ErrZeroVariance = errors.New("sample has zero variance") - ErrMismatchedSamples = errors.New("samples have different lengths") -) - -// TwoSampleTTest performs a two-sample (unpaired) Student's t-test on -// samples x1 and x2. This is a test of the null hypothesis that x1 -// and x2 are drawn from populations with equal means. It assumes x1 -// and x2 are independent samples, that the distributions have equal -// variance, and that the populations are normally distributed. -func TwoSampleTTest(x1, x2 TTestSample, alt LocationHypothesis) (*TTestResult, error) { - n1, n2 := x1.Weight(), x2.Weight() - if n1 == 0 || n2 == 0 { - return nil, ErrSampleSize - } - v1, v2 := x1.Variance(), x2.Variance() - if v1 == 0 && v2 == 0 { - return nil, ErrZeroVariance - } - - dof := n1 + n2 - 2 - v12 := ((n1-1)*v1 + (n2-1)*v2) / dof - t := (x1.Mean() - x2.Mean()) / math.Sqrt(v12*(1/n1+1/n2)) - return newTTestResult(int(n1), int(n2), t, dof, alt), nil -} - -// TwoSampleWelchTTest performs a two-sample (unpaired) Welch's t-test -// on samples x1 and x2. This is like TwoSampleTTest, but does not -// assume the distributions have equal variance. -func TwoSampleWelchTTest(x1, x2 TTestSample, alt LocationHypothesis) (*TTestResult, error) { - n1, n2 := x1.Weight(), x2.Weight() - if n1 <= 1 || n2 <= 1 { - // TODO: Can we still do this with n == 1? - return nil, ErrSampleSize - } - v1, v2 := x1.Variance(), x2.Variance() - if v1 == 0 && v2 == 0 { - return nil, ErrZeroVariance - } - - dof := math.Pow(v1/n1+v2/n2, 2) / - (math.Pow(v1/n1, 2)/(n1-1) + math.Pow(v2/n2, 2)/(n2-1)) - s := math.Sqrt(v1/n1 + v2/n2) - t := (x1.Mean() - x2.Mean()) / s - return newTTestResult(int(n1), int(n2), t, dof, alt), nil -} - -// PairedTTest performs a two-sample paired t-test on samples x1 and -// x2. If μ0 is non-zero, this tests if the average of the difference -// is significantly different from μ0. If x1 and x2 are identical, -// this returns nil. -func PairedTTest(x1, x2 []float64, μ0 float64, alt LocationHypothesis) (*TTestResult, error) { - if len(x1) != len(x2) { - return nil, ErrMismatchedSamples - } - if len(x1) <= 1 { - // TODO: Can we still do this with n == 1? - return nil, ErrSampleSize - } - - dof := float64(len(x1) - 1) - - diff := make([]float64, len(x1)) - for i := range x1 { - diff[i] = x1[i] - x2[i] - } - sd := StdDev(diff) - if sd == 0 { - // TODO: Can we still do the test? - return nil, ErrZeroVariance - } - t := (Mean(diff) - μ0) * math.Sqrt(float64(len(x1))) / sd - return newTTestResult(len(x1), len(x2), t, dof, alt), nil -} - -// OneSampleTTest performs a one-sample t-test on sample x. This tests -// the null hypothesis that the population mean is equal to μ0. This -// assumes the distribution of the population of sample means is -// normal. -func OneSampleTTest(x TTestSample, μ0 float64, alt LocationHypothesis) (*TTestResult, error) { - n, v := x.Weight(), x.Variance() - if n == 0 { - return nil, ErrSampleSize - } - if v == 0 { - // TODO: Can we still do the test? - return nil, ErrZeroVariance - } - dof := n - 1 - t := (x.Mean() - μ0) * math.Sqrt(n) / math.Sqrt(v) - return newTTestResult(int(n), 0, t, dof, alt), nil -}
diff --git a/perf/go/stats/ttest_test.go b/perf/go/stats/ttest_test.go deleted file mode 100644 index 40ffe15..0000000 --- a/perf/go/stats/ttest_test.go +++ /dev/null
@@ -1,76 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "testing" - - "go.skia.org/infra/go/testutils/unittest" -) - -func TestTTest(t *testing.T) { - unittest.SmallTest(t) - s1 := Sample{Xs: []float64{2, 1, 3, 4}} - s2 := Sample{Xs: []float64{6, 5, 7, 9}} - - check := func(want, got *TTestResult) { - if want.N1 != got.N1 || want.N2 != got.N2 || - !aeq(want.T, got.T) || !aeq(want.DoF, got.DoF) || - want.AltHypothesis != got.AltHypothesis || - !aeq(want.P, got.P) { - t.Errorf("want %+v, got %+v", want, got) - } - } - check3 := func(test func(alt LocationHypothesis) (*TTestResult, error), n1, n2 int, t, dof float64, pless, pdiff, pgreater float64) { - want := &TTestResult{N1: n1, N2: n2, T: t, DoF: dof} - - want.AltHypothesis = LocationLess - want.P = pless - got, _ := test(want.AltHypothesis) - check(want, got) - - want.AltHypothesis = LocationDiffers - want.P = pdiff - got, _ = test(want.AltHypothesis) - check(want, got) - - want.AltHypothesis = LocationGreater - want.P = pgreater - got, _ = test(want.AltHypothesis) - check(want, got) - } - - check3(func(alt LocationHypothesis) (*TTestResult, error) { - return TwoSampleTTest(s1, s1, alt) - }, 4, 4, 0, 6, - 0.5, 1, 0.5) - check3(func(alt LocationHypothesis) (*TTestResult, error) { - return TwoSampleWelchTTest(s1, s1, alt) - }, 4, 4, 0, 6, - 0.5, 1, 0.5) - - check3(func(alt LocationHypothesis) (*TTestResult, error) { - return TwoSampleTTest(s1, s2, alt) - }, 4, 4, -3.9703446152237674, 6, - 0.0036820296121056195, 0.0073640592242113214, 0.9963179703878944) - check3(func(alt LocationHypothesis) (*TTestResult, error) { - return TwoSampleWelchTTest(s1, s2, alt) - }, 4, 4, -3.9703446152237674, 5.584615384615385, - 0.004256431565689112, 0.0085128631313781695, 0.9957435684343109) - - check3(func(alt LocationHypothesis) (*TTestResult, error) { - return PairedTTest(s1.Xs, s2.Xs, 0, alt) - }, 4, 4, -17, 3, - 0.0002216717691559955, 0.00044334353831207749, 0.999778328230844) - - check3(func(alt LocationHypothesis) (*TTestResult, error) { - return OneSampleTTest(s1, 0, alt) - }, 4, 0, 3.872983346207417, 3, - 0.9847668541689145, 0.030466291662170977, 0.015233145831085482) - check3(func(alt LocationHypothesis) (*TTestResult, error) { - return OneSampleTTest(s1, 2.5, alt) - }, 4, 0, 0, 3, - 0.5, 1, 0.5) -}
diff --git a/perf/go/stats/udist.go b/perf/go/stats/udist.go deleted file mode 100644 index 26daa95..0000000 --- a/perf/go/stats/udist.go +++ /dev/null
@@ -1,387 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "math" -) - -// A UDist is the discrete probability distribution of the -// Mann-Whitney U statistic for a pair of samples of sizes N1 and N2. -// -// The details of computing this distribution with no ties can be -// found in Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of -// Whether one of Two Random Variables is Stochastically Larger than -// the Other". Annals of Mathematical Statistics 18 (1): 50–60. -// Computing this distribution in the presence of ties is described in -// Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer". -// Journal of the American Statistical Association 61 (315): 772-787 -// and Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney -// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7: -// 805-813 (the former paper contains details that are glossed over in -// the latter paper but has mathematical typesetting issues, so it's -// easiest to get the context from the former paper and the details -// from the latter). -type UDist struct { - N1, N2 int - - // T is the count of the number of ties at each rank in the - // input distributions. T may be nil, in which case it is - // assumed there are no ties (which is equivalent to an M+N - // slice of 1s). It must be the case that Sum(T) == M+N. - T []int -} - -// hasTies returns true if d has any tied samples. -func (d UDist) hasTies() bool { - for _, t := range d.T { - if t > 1 { - return true - } - } - return false -} - -// p returns the p_{d.N1,d.N2} function defined by Mann, Whitney 1947 -// for values of U from 0 up to and including the U argument. -// -// This algorithm runs in Θ(N1*N2*U) = O(N1²N2²) time and is quite -// fast for small values of N1 and N2. However, it does not handle ties. -func (d UDist) p(U int) []float64 { - // This is a dynamic programming implementation of the - // recursive recurrence definition given by Mann and Whitney: - // - // p_{n,m}(U) = (n * p_{n-1,m}(U-m) + m * p_{n,m-1}(U)) / (n+m) - // p_{n,m}(U) = 0 if U < 0 - // p_{0,m}(U) = p{n,0}(U) = 1 / nCr(m+n, n) if U = 0 - // = 0 if U > 0 - // - // (Note that there is a typo in the original paper. The first - // recursive application of p should be for U-m, not U-M.) - // - // Since p{n,m} only depends on p{n-1,m} and p{n,m-1}, we only - // need to store one "plane" of the three dimensional space at - // a time. - // - // Furthermore, p_{n,m} = p_{m,n}, so we only construct values - // for n <= m and obtain the rest through symmetry. - // - // We organize the computed values of p as followed: - // - // n → N - // m * - // ↓ * * - // * * * - // * * * * - // * * * * - // M * * * * - // - // where each * is a slice indexed by U. The code below - // computes these left-to-right, top-to-bottom, so it only - // stores one row of this matrix at a time. Furthermore, - // computing an element in a given U slice only depends on the - // same and smaller values of U, so we can overwrite the U - // slice we're computing in place as long as we start with the - // largest value of U. Finally, even though the recurrence - // depends on (n,m) above the diagonal and we use symmetry to - // mirror those across the diagonal to (m,n), the mirrored - // indexes are always available in the current row, so this - // mirroring does not interfere with our ability to recycle - // state. - - N, M := d.N1, d.N2 - if N > M { - N, M = M, N - } - - memo := make([][]float64, N+1) - for n := range memo { - memo[n] = make([]float64, U+1) - } - - for m := 0; m <= M; m++ { - // Compute p_{0,m}. This is zero except for U=0. - memo[0][0] = 1 - - // Compute the remainder of this row. - nlim := N - if m < nlim { - nlim = m - } - for n := 1; n <= nlim; n++ { - lp := memo[n-1] // p_{n-1,m} - var rp []float64 - if n <= m-1 { - rp = memo[n] // p_{n,m-1} - } else { - rp = memo[m-1] // p{m-1,n} and m==n - } - - // For a given n,m, U is at most n*m. - // - // TODO: Actually, it's at most ⌈n*m/2⌉, but - // then we need to use more complex symmetries - // in the inner loop below. - ulim := n * m - if U < ulim { - ulim = U - } - - out := memo[n] // p_{n,m} - nplusm := float64(n + m) - for U1 := ulim; U1 >= 0; U1-- { - l := 0.0 - if U1-m >= 0 { - l = float64(n) * lp[U1-m] - } - r := float64(m) * rp[U1] - out[U1] = (l + r) / nplusm - } - } - } - return memo[N] -} - -type ukey struct { - n1 int // size of first sample - twoU int // 2*U statistic for this permutation -} - -// This computes the cumulative counts of the Mann-Whitney U -// distribution in the presence of ties using the computation from -// Cheung, Ying Kuen; Klotz, Jerome H. (1997). "The Mann Whitney -// Wilcoxon Distribution Using Linked Lists". Statistica Sinica 7: -// 805-813, with much guidance from appendix L of Klotz, A -// Computational Approach to Statistics. -// -// makeUmemo constructs a table memo[K][ukey{n1, 2*U}], where K is the -// number of ranks (up to len(t)), n1 is the size of the first sample -// (up to the n1 argument), and U is the U statistic (up to the -// argument twoU/2). The value of an entry in the memo table is the -// number of permutations of a sample of size n1 in a ranking with tie -// vector t[:K] having a U statistic <= U. -func makeUmemo(twoU, n1 int, t []int) []map[ukey]float64 { - // Another candidate for a fast implementation is van de Wiel, - // "The split-up algorithm: a fast symbolic method for - // computing p-values of distribution-free statistics". This - // is what's used by R's coin package. It's a comparatively - // recent publication, so it's presumably faster (or perhaps - // just more general) than previous techniques, but I can't - // get my hands on the paper. - // - // TODO: ~40% of this function's time is spent in mapassign on - // the assignment lines in the two loops and another ~20% in - // map access and iteration. Improving map behavior or - // replacing the maps altogether with some other constant-time - // structure could double performance. - // - // TODO: The worst case for this function is when there are - // few ties. Yet the best case overall is when there are *no* - // ties. Can we get the best of both worlds? Use the fast - // algorithm for the most part when there are few ties and mix - // in the general algorithm just where we need it? That's - // certainly possible for sub-problems where t[:k] has no - // ties, but that doesn't help if t[0] has a tie but nothing - // else does. Is it possible to rearrange the ranks without - // messing up our computation of the U statistic for - // sub-problems? - - K := len(t) - - // Compute a coefficients. The a slice is indexed by k (a[0] - // is unused). - a := make([]int, K+1) - a[1] = t[0] - for k := 2; k <= K; k++ { - a[k] = a[k-1] + t[k-2] + t[k-1] - } - - // Create the memo table for the counts function, A. The A - // slice is indexed by k (A[0] is unused). - // - // In "The Mann Whitney Distribution Using Linked Lists", they - // use linked lists (*gasp*) for this, but within each K it's - // really just a memoization table, so it's faster to use a - // map. The outer structure is a slice indexed by k because we - // need to find all memo entries with certain values of k. - // - // TODO: The n1 and twoU values in the ukeys follow strict - // patterns. For each K value, the n1 values are every integer - // between two bounds. For each (K, n1) value, the twoU values - // are every integer multiple of a certain base between two - // bounds. It might be worth turning these into directly - // indexible slices. - A := make([]map[ukey]float64, K+1) - A[K] = map[ukey]float64{{n1: n1, twoU: twoU}: 0} - - // Compute memo table (k, n1, twoU) triples from high K values - // to low K values. This drives the recurrence relation - // downward to figure out all of the needed argument triples. - // - // TODO: Is it possible to generate this table bottom-up? If - // so, this could be a pure dynamic programming algorithm and - // we could discard the K dimension. We could at least store - // the inputs in a more compact representation that replaces - // the twoU dimension with an interval and a step size (as - // suggested by Cheung, Klotz, not that they make it at all - // clear *why* they're suggesting this). - tsum := sumint(t) // always ∑ t[0:k] - for k := K - 1; k >= 2; k-- { - tsum -= t[k] - A[k] = make(map[ukey]float64) - - // Construct A[k] from A[k+1]. - for A_kplus1 := range A[k+1] { - rkLow := maxint(0, A_kplus1.n1-tsum) - rkHigh := minint(A_kplus1.n1, t[k]) - for rk := rkLow; rk <= rkHigh; rk++ { - twoU_k := A_kplus1.twoU - rk*(a[k+1]-2*A_kplus1.n1+rk) - n1_k := A_kplus1.n1 - rk - if twoUmin(n1_k, t[:k], a) <= twoU_k && twoU_k <= twoUmax(n1_k, t[:k], a) { - key := ukey{n1: n1_k, twoU: twoU_k} - A[k][key] = 0 - } - } - } - } - - // Fill counts in memo table from low K values to high K - // values. This unwinds the recurrence relation. - - // Start with K==2 base case. - // - // TODO: Later computations depend on these, but these don't - // depend on anything (including each other), so if K==2, we - // can skip the memo table altogether. - if K < 2 { - panic("K < 2") - } - N_2 := t[0] + t[1] - for A_2i := range A[2] { - Asum := 0.0 - r2Low := maxint(0, A_2i.n1-t[0]) - r2High := (A_2i.twoU - A_2i.n1*(t[0]-A_2i.n1)) / N_2 - for r2 := r2Low; r2 <= r2High; r2++ { - Asum += mathChoose(t[0], A_2i.n1-r2) * - mathChoose(t[1], r2) - } - A[2][A_2i] = Asum - } - - // Derive counts for the rest of the memo table. - tsum = t[0] // always ∑ t[0:k-1] - for k := 3; k <= K; k++ { - tsum += t[k-2] - - // Compute A[k] counts from A[k-1] counts. - for A_ki := range A[k] { - Asum := 0.0 - rkLow := maxint(0, A_ki.n1-tsum) - rkHigh := minint(A_ki.n1, t[k-1]) - for rk := rkLow; rk <= rkHigh; rk++ { - twoU_kminus1 := A_ki.twoU - rk*(a[k]-2*A_ki.n1+rk) - n1_kminus1 := A_ki.n1 - rk - x, ok := A[k-1][ukey{n1: n1_kminus1, twoU: twoU_kminus1}] - if !ok && twoUmax(n1_kminus1, t[:k-1], a) < twoU_kminus1 { - x = mathChoose(tsum, n1_kminus1) - } - Asum += x * mathChoose(t[k-1], rk) - } - A[k][A_ki] = Asum - } - } - - return A -} - -func twoUmin(n1 int, t, a []int) int { - K := len(t) - twoU := -n1 * n1 - n1_k := n1 - for k := 1; k <= K; k++ { - twoU_k := minint(n1_k, t[k-1]) - twoU += twoU_k * a[k] - n1_k -= twoU_k - } - return twoU -} - -func twoUmax(n1 int, t, a []int) int { - K := len(t) - twoU := -n1 * n1 - n1_k := n1 - for k := K; k > 0; k-- { - twoU_k := minint(n1_k, t[k-1]) - twoU += twoU_k * a[k] - n1_k -= twoU_k - } - return twoU -} - -func (d UDist) PMF(U float64) float64 { - if U < 0 || U >= 0.5+float64(d.N1*d.N2) { - return 0 - } - - if d.hasTies() { - // makeUmemo computes the CDF directly. Take its - // difference to get the PMF. - p1, ok1 := makeUmemo(int(2*U)-1, d.N1, d.T)[len(d.T)][ukey{d.N1, int(2*U) - 1}] - p2, ok2 := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] - if !ok1 || !ok2 { - panic("makeUmemo did not return expected memoization table") - } - return (p2 - p1) / mathChoose(d.N1+d.N2, d.N1) - } - - // There are no ties. Use the fast algorithm. U must be integral. - Ui := int(math.Floor(U)) - // TODO: Use symmetry to minimize U - return d.p(Ui)[Ui] -} - -func (d UDist) CDF(U float64) float64 { - if U < 0 { - return 0 - } else if U >= float64(d.N1*d.N2) { - return 1 - } - - if d.hasTies() { - // TODO: Minimize U? - p, ok := makeUmemo(int(2*U), d.N1, d.T)[len(d.T)][ukey{d.N1, int(2 * U)}] - if !ok { - panic("makeUmemo did not return expected memoization table") - } - return p / mathChoose(d.N1+d.N2, d.N1) - } - - // There are no ties. Use the fast algorithm. U must be integral. - Ui := int(math.Floor(U)) - // The distribution is symmetric around U = m * n / 2. Sum up - // whichever tail is smaller. - flip := Ui >= (d.N1*d.N2+1)/2 - if flip { - Ui = d.N1*d.N2 - Ui - 1 - } - pdfs := d.p(Ui) - p := 0.0 - for _, pdf := range pdfs[:Ui+1] { - p += pdf - } - if flip { - p = 1 - p - } - return p -} - -func (d UDist) Step() float64 { - return 0.5 -} - -func (d UDist) Bounds() (float64, float64) { - // TODO: More precise bounds when there are ties. - return 0, float64(d.N1 * d.N2) -}
diff --git a/perf/go/stats/udist_test.go b/perf/go/stats/udist_test.go deleted file mode 100644 index baceb32..0000000 --- a/perf/go/stats/udist_test.go +++ /dev/null
@@ -1,326 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "fmt" - "math" - "testing" - - "go.skia.org/infra/go/testutils/unittest" -) - -func aeqTable(a, b [][]float64) bool { - if len(a) != len(b) { - return false - } - for i := range a { - if len(a[i]) != len(b[i]) { - return false - } - for j := range a[i] { - // "%f" precision - if math.Abs(a[i][j]-b[i][j]) >= 0.000001 { - return false - } - } - } - return true -} - -// U distribution for N=3 up to U=5. -var udist3 = [][]float64{ - // m=1 2 3 - {0.250000, 0.100000, 0.050000}, // U=0 - {0.500000, 0.200000, 0.100000}, // U=1 - {0.750000, 0.400000, 0.200000}, // U=2 - {1.000000, 0.600000, 0.350000}, // U=3 - {1.000000, 0.800000, 0.500000}, // U=4 - {1.000000, 0.900000, 0.650000}, // U=5 -} - -// U distribution for N=5 up to U=5. -var udist5 = [][]float64{ - // m=1 2 3 4 5 - {0.166667, 0.047619, 0.017857, 0.007937, 0.003968}, // U=0 - {0.333333, 0.095238, 0.035714, 0.015873, 0.007937}, // U=1 - {0.500000, 0.190476, 0.071429, 0.031746, 0.015873}, // U=2 - {0.666667, 0.285714, 0.125000, 0.055556, 0.027778}, // U=3 - {0.833333, 0.428571, 0.196429, 0.095238, 0.047619}, // U=4 - {1.000000, 0.571429, 0.285714, 0.142857, 0.075397}, // U=5 -} - -func TestUDist(t *testing.T) { - unittest.SmallTest(t) - makeTable := func(n int) [][]float64 { - out := make([][]float64, 6) - for U := 0; U < 6; U++ { - out[U] = make([]float64, n) - for m := 1; m <= n; m++ { - out[U][m-1] = UDist{N1: m, N2: n}.CDF(float64(U)) - } - } - return out - } - fmtTable := func(a [][]float64) string { - out := fmt.Sprintf("%8s", "m=") - for m := 1; m <= len(a[0]); m++ { - out += fmt.Sprintf("%9d", m) - } - out += "\n" - - for U, row := range a { - out += fmt.Sprintf("U=%-6d", U) - for m := 1; m <= len(a[0]); m++ { - out += fmt.Sprintf(" %f", row[m-1]) - } - out += "\n" - } - return out - } - - // Compare against tables given in Mann, Whitney (1947). - got3 := makeTable(3) - if !aeqTable(got3, udist3) { - t.Errorf("For n=3, want:\n%sgot:\n%s", fmtTable(udist3), fmtTable(got3)) - } - - got5 := makeTable(5) - if !aeqTable(got5, udist5) { - t.Errorf("For n=5, want:\n%sgot:\n%s", fmtTable(udist5), fmtTable(got5)) - } -} - -func BenchmarkUDist(b *testing.B) { - for i := 0; i < b.N; i++ { - // R uses the exact distribution up to N=50. - // N*M/2=1250 is the hardest point to get the CDF for. - UDist{N1: 50, N2: 50}.CDF(1250) - } -} - -func TestUDistTies(t *testing.T) { - unittest.SmallTest(t) - makeTable := func(m, N int, t []int, minx, maxx float64) [][]float64 { - out := [][]float64{} - dist := UDist{N1: m, N2: N - m, T: t} - for x := minx; x <= maxx; x += 0.5 { - // Convert x from uQt' to uQv'. - U := x - float64(m*m)/2 - P := dist.CDF(U) - if len(out) == 0 || !aeq(out[len(out)-1][1], P) { - out = append(out, []float64{x, P}) - } - } - return out - } - fmtTable := func(table [][]float64) string { - out := "" - for _, row := range table { - out += fmt.Sprintf("%5.1f %f\n", row[0], row[1]) - } - return out - } - - // Compare against Table 1 from Klotz (1966). - got := makeTable(5, 10, []int{1, 1, 2, 1, 1, 2, 1, 1}, 12.5, 19.5) - want := [][]float64{ - {12.5, 0.003968}, {13.5, 0.007937}, - {15.0, 0.023810}, {16.5, 0.047619}, - {17.5, 0.071429}, {18.0, 0.087302}, - {19.0, 0.134921}, {19.5, 0.138889}, - } - if !aeqTable(got, want) { - t.Errorf("Want:\n%sgot:\n%s", fmtTable(want), fmtTable(got)) - } - - got = makeTable(10, 21, []int{6, 5, 4, 3, 2, 1}, 52, 87) - want = [][]float64{ - {52.0, 0.000014}, {56.5, 0.000128}, - {57.5, 0.000145}, {60.0, 0.000230}, - {61.0, 0.000400}, {62.0, 0.000740}, - {62.5, 0.000797}, {64.0, 0.000825}, - {64.5, 0.001165}, {65.5, 0.001477}, - {66.5, 0.002498}, {67.0, 0.002725}, - {67.5, 0.002895}, {68.0, 0.003150}, - {68.5, 0.003263}, {69.0, 0.003518}, - {69.5, 0.003603}, {70.0, 0.005648}, - {70.5, 0.005818}, {71.0, 0.006626}, - {71.5, 0.006796}, {72.0, 0.008157}, - {72.5, 0.009688}, {73.0, 0.009801}, - {73.5, 0.010430}, {74.0, 0.011111}, - {74.5, 0.014230}, {75.0, 0.014612}, - {75.5, 0.017249}, {76.0, 0.018307}, - {76.5, 0.020178}, {77.0, 0.022270}, - {77.5, 0.023189}, {78.0, 0.026931}, - {78.5, 0.028207}, {79.0, 0.029979}, - {79.5, 0.030931}, {80.0, 0.038969}, - {80.5, 0.043063}, {81.0, 0.044262}, - {81.5, 0.046389}, {82.0, 0.049581}, - {82.5, 0.056300}, {83.0, 0.058027}, - {83.5, 0.063669}, {84.0, 0.067454}, - {84.5, 0.074122}, {85.0, 0.077425}, - {85.5, 0.083498}, {86.0, 0.094079}, - {86.5, 0.096693}, {87.0, 0.101132}, - } - if !aeqTable(got, want) { - t.Errorf("Want:\n%sgot:\n%s", fmtTable(want), fmtTable(got)) - } - - got = makeTable(8, 16, []int{2, 2, 2, 2, 2, 2, 2, 2}, 32, 54) - want = [][]float64{ - {32.0, 0.000078}, {34.0, 0.000389}, - {36.0, 0.001088}, {38.0, 0.002642}, - {40.0, 0.005905}, {42.0, 0.011500}, - {44.0, 0.021057}, {46.0, 0.035664}, - {48.0, 0.057187}, {50.0, 0.086713}, - {52.0, 0.126263}, {54.0, 0.175369}, - } - if !aeqTable(got, want) { - t.Errorf("Want:\n%sgot:\n%s", fmtTable(want), fmtTable(got)) - } - - // Check remaining tables from Klotz against the reference - // implementation. - checkRef := func(n1 int, tie []int) { - wantPMF1, wantCDF1 := udistRef(n1, tie) - - dist := UDist{N1: n1, N2: sumint(tie) - n1, T: tie} - gotPMF, wantPMF := [][]float64{}, [][]float64{} - gotCDF, wantCDF := [][]float64{}, [][]float64{} - N := sumint(tie) - for U := 0.0; U <= float64(n1*(N-n1)); U += 0.5 { - gotPMF = append(gotPMF, []float64{U, dist.PMF(U)}) - gotCDF = append(gotCDF, []float64{U, dist.CDF(U)}) - wantPMF = append(wantPMF, []float64{U, wantPMF1[int(U*2)]}) - wantCDF = append(wantCDF, []float64{U, wantCDF1[int(U*2)]}) - } - if !aeqTable(wantPMF, gotPMF) { - t.Errorf("For PMF of n1=%v, t=%v, want:\n%sgot:\n%s", n1, tie, fmtTable(wantPMF), fmtTable(gotPMF)) - } - if !aeqTable(wantCDF, gotCDF) { - t.Errorf("For CDF of n1=%v, t=%v, want:\n%sgot:\n%s", n1, tie, fmtTable(wantCDF), fmtTable(gotCDF)) - } - } - checkRef(5, []int{1, 1, 2, 1, 1, 2, 1, 1}) - checkRef(5, []int{1, 1, 2, 1, 1, 1, 2, 1}) - checkRef(5, []int{1, 3, 1, 2, 1, 1, 1}) - checkRef(8, []int{1, 2, 1, 1, 1, 1, 2, 2, 1, 2}) - checkRef(12, []int{3, 3, 4, 3, 4, 5}) - checkRef(10, []int{1, 2, 3, 4, 5, 6}) -} - -func BenchmarkUDistTies(b *testing.B) { - // Worst case: just one tie. - n := 20 - t := make([]int, 2*n-1) - for i := range t { - t[i] = 1 - } - t[0] = 2 - - for i := 0; i < b.N; i++ { - UDist{N1: n, N2: n, T: t}.CDF(float64(n*n) / 2) - } -} - -func XTestPrintUmemo(t *testing.T) { - // Reproduce table from Cheung, Klotz. - ties := []int{4, 5, 3, 4, 6} - printUmemo(makeUmemo(80, 10, ties), ties) -} - -// udistRef computes the PMF and CDF of the U distribution for two -// samples of sizes n1 and sum(t)-n1 with tie vector t. The returned -// pmf and cdf are indexed by 2*U. -// -// This uses the "graphical method" of Klotz (1966). It is very slow -// (Θ(∏ (t[i]+1)) = Ω(2^|t|)), but very correct, and hence useful as a -// reference for testing faster implementations. -func udistRef(n1 int, t []int) (pmf, cdf []float64) { - // Enumerate all u vectors for which 0 <= u_i <= t_i. Count - // the number of permutations of two samples of sizes n1 and - // sum(t)-n1 with tie vector t and accumulate these counts by - // their U statistics in count[2*U]. - counts := make([]int, 1+2*n1*(sumint(t)-n1)) - - u := make([]int, len(t)) - u[0] = -1 // Get enumeration started. -enumu: - for { - // Compute the next u vector. - u[0]++ - for i := 0; i < len(u) && u[i] > t[i]; i++ { - if i == len(u)-1 { - // All u vectors have been enumerated. - break enumu - } - // Carry. - u[i+1]++ - u[i] = 0 - } - - // Is this a legal u vector? - if sumint(u) != n1 { - // Klotz (1966) has a method for directly - // enumerating legal u vectors, but the point - // of this is to be correct, not fast. - continue - } - - // Compute 2*U statistic for this u vector. - twoU, vsum := 0, 0 - for i, u_i := range u { - v_i := t[i] - u_i - // U = U + vsum*u_i + u_i*v_i/2 - twoU += 2*vsum*u_i + u_i*v_i - vsum += v_i - } - - // Compute Π choose(t_i, u_i). This is the number of - // ways of permuting the input sample under u. - prod := 1 - for i, u_i := range u { - prod *= int(mathChoose(t[i], u_i) + 0.5) - } - - // Accumulate the permutations on this u path. - counts[twoU] += prod - - if false { - // Print a table in the form of Klotz's - // "direct enumeration" example. - // - // Convert 2U = 2UQV' to UQt' used in Klotz - // examples. - UQt := float64(twoU)/2 + float64(n1*n1)/2 - fmt.Printf("%+v %f %-2d\n", u, UQt, prod) - } - } - - // Convert counts into probabilities for PMF and CDF. - pmf = make([]float64, len(counts)) - cdf = make([]float64, len(counts)) - total := int(mathChoose(sumint(t), n1) + 0.5) - for i, count := range counts { - pmf[i] = float64(count) / float64(total) - if i > 0 { - cdf[i] = cdf[i-1] - } - cdf[i] += pmf[i] - } - return -} - -// printUmemo prints the output of makeUmemo for debugging. -func printUmemo(A []map[ukey]float64, t []int) { - fmt.Printf("K\tn1\t2*U\tpr\n") - for K := len(A) - 1; K >= 0; K-- { - for i, pr := range A[K] { - _, ref := udistRef(i.n1, t[:K]) - fmt.Printf("%v\t%v\t%v\t%v\t%v\n", K, i.n1, i.twoU, pr, ref[i.twoU]) - } - } -}
diff --git a/perf/go/stats/utest.go b/perf/go/stats/utest.go deleted file mode 100644 index 048ebe7..0000000 --- a/perf/go/stats/utest.go +++ /dev/null
@@ -1,274 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "math" - "sort" -) - -// A LocationHypothesis specifies the alternative hypothesis of a -// location test such as a t-test or a Mann-Whitney U-test. The -// default (zero) value is to test against the alternative hypothesis -// that they differ. -type LocationHypothesis int - -//go:generate stringer -type LocationHypothesis - -const ( - // LocationLess specifies the alternative hypothesis that the - // location of the first sample is less than the second. This - // is a one-tailed test. - LocationLess LocationHypothesis = -1 - - // LocationDiffers specifies the alternative hypothesis that - // the locations of the two samples are not equal. This is a - // two-tailed test. - LocationDiffers LocationHypothesis = 0 - - // LocationGreater specifies the alternative hypothesis that - // the location of the first sample is greater than the - // second. This is a one-tailed test. - LocationGreater LocationHypothesis = 1 -) - -// A MannWhitneyUTestResult is the result of a Mann-Whitney U-test. -type MannWhitneyUTestResult struct { - // N1 and N2 are the sizes of the input samples. - N1, N2 int - - // U is the value of the Mann-Whitney U statistic for this - // test, generalized by counting ties as 0.5. - // - // Given the Cartesian product of the two samples, this is the - // number of pairs in which the value from the first sample is - // greater than the value of the second, plus 0.5 times the - // number of pairs where the values from the two samples are - // equal. Hence, U is always an integer multiple of 0.5 (it is - // a whole integer if there are no ties) in the range [0, N1*N2]. - // - // U statistics always come in pairs, depending on which - // sample is "first". The mirror U for the other sample can be - // calculated as N1*N2 - U. - // - // There are many equivalent statistics with slightly - // different definitions. The Wilcoxon (1945) W statistic - // (generalized for ties) is U + (N1(N1+1))/2. It is also - // common to use 2U to eliminate the half steps and Smid - // (1956) uses N1*N2 - 2U to additionally center the - // distribution. - U float64 - - // AltHypothesis specifies the alternative hypothesis tested - // by this test against the null hypothesis that there is no - // difference in the locations of the samples. - AltHypothesis LocationHypothesis - - // P is the p-value of the Mann-Whitney test for the given - // null hypothesis. - P float64 -} - -// MannWhitneyExactLimit gives the largest sample size for which the -// exact U distribution will be used for the Mann-Whitney U-test. -// -// Using the exact distribution is necessary for small sample sizes -// because the distribution is highly irregular. However, computing -// the distribution for large sample sizes is both computationally -// expensive and unnecessary because it quickly approaches a normal -// approximation. Computing the distribution for two 50 value samples -// takes a few milliseconds on a 2014 laptop. -var MannWhitneyExactLimit = 50 - -// MannWhitneyTiesExactLimit gives the largest sample size for which -// the exact U distribution will be used for the Mann-Whitney U-test -// in the presence of ties. -// -// Computing this distribution is more expensive than computing the -// distribution without ties, so this is set lower. Computing this -// distribution for two 25 value samples takes about ten milliseconds -// on a 2014 laptop. -var MannWhitneyTiesExactLimit = 25 - -// MannWhitneyUTest performs a Mann-Whitney U-test [1,2] of the null -// hypothesis that two samples come from the same population against -// the alternative hypothesis that one sample tends to have larger or -// smaller values than the other. -// -// This is similar to a t-test, but unlike the t-test, the -// Mann-Whitney U-test is non-parametric (it does not assume a normal -// distribution). It has very slightly lower efficiency than the -// t-test on normal distributions. -// -// Computing the exact U distribution is expensive for large sample -// sizes, so this uses a normal approximation for sample sizes larger -// than MannWhitneyExactLimit if there are no ties or -// MannWhitneyTiesExactLimit if there are ties. This normal -// approximation uses both the tie correction and the continuity -// correction. -// -// This can fail with ErrSampleSize if either sample is empty or -// ErrSamplesEqual if all sample values are equal. -// -// This is also known as a Mann-Whitney-Wilcoxon test and is -// equivalent to the Wilcoxon rank-sum test, though the Wilcoxon -// rank-sum test differs in nomenclature. -// -// [1] Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of -// Whether one of Two Random Variables is Stochastically Larger than -// the Other". Annals of Mathematical Statistics 18 (1): 50–60. -// -// [2] Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer". -// Journal of the American Statistical Association 61 (315): 772-787. -func MannWhitneyUTest(x1, x2 []float64, alt LocationHypothesis) (*MannWhitneyUTestResult, error) { - n1, n2 := len(x1), len(x2) - if n1 == 0 || n2 == 0 { - return nil, ErrSampleSize - } - - // Compute the U statistic and tie vector T. - x1 = append([]float64(nil), x1...) - x2 = append([]float64(nil), x2...) - sort.Float64s(x1) - sort.Float64s(x2) - merged, labels := labeledMerge(x1, x2) - - R1 := 0.0 - T, hasTies := []int{}, false - for i := 0; i < len(merged); { - rank1, nx1, v1 := i+1, 0, merged[i] - // Consume samples that tie this sample (including itself). - for ; i < len(merged) && merged[i] == v1; i++ { - if labels[i] == 1 { - nx1++ - } - } - // Assign all tied samples the average rank of the - // samples, where merged[0] has rank 1. - if nx1 != 0 { - rank := float64(i+rank1) / 2 - R1 += rank * float64(nx1) - } - T = append(T, i-rank1+1) - if i > rank1 { - hasTies = true - } - } - U1 := R1 - float64(n1*(n1+1))/2 - - // Compute the smaller of U1 and U2 - U2 := float64(n1*n2) - U1 - Usmall := math.Min(U1, U2) - - var p float64 - if !hasTies && n1 <= MannWhitneyExactLimit && n2 <= MannWhitneyExactLimit || - hasTies && n1 <= MannWhitneyTiesExactLimit && n2 <= MannWhitneyTiesExactLimit { - // Use exact U distribution. U1 will be an integer. - if len(T) == 1 { - // All values are equal. Test is meaningless. - return nil, ErrSamplesEqual - } - - dist := UDist{N1: n1, N2: n2, T: T} - switch alt { - case LocationDiffers: - if U1 == U2 { - // The distribution is symmetric about - // Usmall. Since the distribution is - // discrete, the CDF is discontinuous - // and if simply double CDF(Usmall), - // we'll double count the - // (non-infinitesimal) probability - // mass at Usmall. What we want is - // just the integral of the whole CDF, - // which is 1. - p = 1 - } else { - p = dist.CDF(Usmall) * 2 - } - - case LocationLess: - p = dist.CDF(U1) - - case LocationGreater: - p = 1 - dist.CDF(U1-1) - } - } else { - // Use normal approximation (with tie and continuity - // correction). - t := tieCorrection(T) - N := float64(n1 + n2) - μ_U := float64(n1*n2) / 2 - σ_U := math.Sqrt(float64(n1*n2) * ((N + 1) - t/(N*(N-1))) / 12) - if σ_U == 0 { - return nil, ErrSamplesEqual - } - numer := U1 - μ_U - // Perform continuity correction. - switch alt { - case LocationDiffers: - numer -= mathSign(numer) * 0.5 - case LocationLess: - numer += 0.5 - case LocationGreater: - numer -= 0.5 - } - z := numer / σ_U - switch alt { - case LocationDiffers: - p = 2 * math.Min(StdNormal.CDF(z), 1-StdNormal.CDF(z)) - case LocationLess: - p = StdNormal.CDF(z) - case LocationGreater: - p = 1 - StdNormal.CDF(z) - } - } - - return &MannWhitneyUTestResult{N1: n1, N2: n2, U: U1, - AltHypothesis: alt, P: p}, nil -} - -// labeledMerge merges sorted lists x1 and x2 into sorted list merged. -// labels[i] is 1 or 2 depending on whether merged[i] is a value from -// x1 or x2, respectively. -func labeledMerge(x1, x2 []float64) (merged []float64, labels []byte) { - merged = make([]float64, len(x1)+len(x2)) - labels = make([]byte, len(x1)+len(x2)) - - i, j, o := 0, 0, 0 - for i < len(x1) && j < len(x2) { - if x1[i] < x2[j] { - merged[o] = x1[i] - labels[o] = 1 - i++ - } else { - merged[o] = x2[j] - labels[o] = 2 - j++ - } - o++ - } - for ; i < len(x1); i++ { - merged[o] = x1[i] - labels[o] = 1 - o++ - } - for ; j < len(x2); j++ { - merged[o] = x2[j] - labels[o] = 2 - o++ - } - return -} - -// tieCorrection computes the tie correction factor Σ_j (t_j³ - t_j) -// where t_j is the number of ties in the j'th rank. -func tieCorrection(ties []int) float64 { - t := 0 - for _, tie := range ties { - t += tie*tie*tie - tie - } - return float64(t) -}
diff --git a/perf/go/stats/utest_test.go b/perf/go/stats/utest_test.go deleted file mode 100644 index a3e3f19..0000000 --- a/perf/go/stats/utest_test.go +++ /dev/null
@@ -1,85 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "testing" - - "go.skia.org/infra/go/testutils/unittest" -) - -func TestMannWhitneyUTest(t *testing.T) { - unittest.SmallTest(t) - check := func(want, got *MannWhitneyUTestResult) { - if want.N1 != got.N1 || want.N2 != got.N2 || - !aeq(want.U, got.U) || - want.AltHypothesis != got.AltHypothesis || - !aeq(want.P, got.P) { - t.Errorf("want %+v, got %+v", want, got) - } - } - check3 := func(x1, x2 []float64, U float64, pless, pdiff, pgreater float64) { - want := &MannWhitneyUTestResult{N1: len(x1), N2: len(x2), U: U} - - want.AltHypothesis = LocationLess - want.P = pless - got, _ := MannWhitneyUTest(x1, x2, want.AltHypothesis) - check(want, got) - - want.AltHypothesis = LocationDiffers - want.P = pdiff - got, _ = MannWhitneyUTest(x1, x2, want.AltHypothesis) - check(want, got) - - want.AltHypothesis = LocationGreater - want.P = pgreater - got, _ = MannWhitneyUTest(x1, x2, want.AltHypothesis) - check(want, got) - } - - s1 := []float64{2, 1, 3, 5} - s2 := []float64{12, 11, 13, 15} - s3 := []float64{0, 4, 6, 7} // Interleaved with s1, but no ties - s4 := []float64{2, 2, 2, 2} - s5 := []float64{1, 1, 1, 1, 1} - - // Small sample, no ties - check3(s1, s2, 0, 0.014285714285714289, 0.028571428571428577, 1) - check3(s2, s1, 16, 1, 0.028571428571428577, 0.014285714285714289) - check3(s1, s3, 5, 0.24285714285714288, 0.485714285714285770, 0.8285714285714285) - - // Small sample, ties - // TODO: Check these against some other implementation. - check3(s1, s1, 8, 0.6285714285714286, 1, 0.6285714285714286) - check3(s1, s4, 10, 0.8571428571428571, 0.7142857142857143, 0.3571428571428571) - check3(s1, s5, 17.5, 1, 0, 0.04761904761904767) - - r, err := MannWhitneyUTest(s4, s4, LocationDiffers) - if err != ErrSamplesEqual { - t.Errorf("want ErrSamplesEqual, got %+v, %+v", r, err) - } - - // Large samples. - l1 := make([]float64, 500) - for i := range l1 { - l1[i] = float64(i * 2) - } - l2 := make([]float64, 600) - for i := range l2 { - l2[i] = float64(i*2 - 41) - } - l3 := append([]float64{}, l2...) - for i := 0; i < 30; i++ { - l3[i] = l1[i] - } - // For comparing with R's wilcox.test: - // l1 <- seq(0, 499)*2 - // l2 <- seq(0,599)*2-41 - // l3 <- l2; for (i in 1:30) { l3[i] = l1[i] } - - check3(l1, l2, 135250, 0.0024667680407086112, 0.0049335360814172224, 0.9975346930458906) - check3(l1, l1, 125000, 0.5000436801680628, 1, 0.5000436801680628) - check3(l1, l3, 134845, 0.0019351907119808942, 0.0038703814239617884, 0.9980659818257166) -}
diff --git a/perf/go/stats/util_test.go b/perf/go/stats/util_test.go deleted file mode 100644 index 4ab13f6..0000000 --- a/perf/go/stats/util_test.go +++ /dev/null
@@ -1,117 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package stats - -import ( - "fmt" - "math" - "sort" - "strings" - "testing" -) - -func testDiscreteCDF(t *testing.T, name string, dist DiscreteDist) { - // Build the expected CDF out of the PMF. - l, h := dist.Bounds() - s := dist.Step() - want := map[float64]float64{l - 0.1: 0, h: 1} - sum := 0.0 - for x := l; x < h; x += s { - sum += dist.PMF(x) - want[x] = sum - want[x+s/2] = sum - } - - testFunc(t, name, dist.CDF, want) -} - -func testInvCDF(t *testing.T, dist Dist, bounded bool) { - inv := InvCDF(dist) - name := fmt.Sprintf("InvCDF(%+v)", dist) - cdfName := fmt.Sprintf("CDF(%+v)", dist) - - // Test bounds. - vals := map[float64]float64{-0.01: nan, 1.01: nan} - if !bounded { - vals[0] = -inf - vals[1] = inf - } - testFunc(t, name, inv, vals) - - if bounded { - lo, hi := inv(0), inv(1) - vals := map[float64]float64{ - lo - 0.01: 0, lo: 0, - hi: 1, hi + 0.01: 1, - } - testFunc(t, cdfName, dist.CDF, vals) - if got := dist.CDF(lo + 0.01); !(got > 0) { - t.Errorf("%s(0)=%v, but %s(%v)=0", name, lo, cdfName, lo+0.01) - } - if got := dist.CDF(hi - 0.01); !(got < 1) { - t.Errorf("%s(1)=%v, but %s(%v)=1", name, hi, cdfName, hi-0.01) - } - } - - // Test points between. - vals = map[float64]float64{} - for _, p := range vecLinspace(0, 1, 11) { - if p == 0 || p == 1 { - continue - } - x := inv(p) - vals[x] = x - } - testFunc(t, fmt.Sprintf("InvCDF(CDF(%+v))", dist), - func(x float64) float64 { - return inv(dist.CDF(x)) - }, - vals) -} - -// aeq returns true if expect and got are equal to 8 significant -// figures (1 part in 100 million). -func aeq(expect, got float64) bool { - if expect < 0 && got < 0 { - expect, got = -expect, -got - } - return expect*0.99999999 <= got && got*0.99999999 <= expect -} - -func testFunc(t *testing.T, name string, f func(float64) float64, vals map[float64]float64) { - xs := make([]float64, 0, len(vals)) - for x := range vals { - xs = append(xs, x) - } - sort.Float64s(xs) - - for _, x := range xs { - want, got := vals[x], f(x) - if math.IsNaN(want) && math.IsNaN(got) || aeq(want, got) { - continue - } - var label string - if strings.Contains(name, "%v") { - label = fmt.Sprintf(name, x) - } else { - label = fmt.Sprintf("%s(%v)", name, x) - } - t.Errorf("want %s=%v, got %v", label, want, got) - } -} - -// vecLinspace returns num values spaced evenly between lo and hi, -// inclusive. If num is 1, this returns an array consisting of lo. -func vecLinspace(lo, hi float64, num int) []float64 { - res := make([]float64, num) - if num == 1 { - res[0] = lo - return res - } - for i := 0; i < num; i++ { - res[i] = lo + float64(i)*(hi-lo)/float64(num-1) - } - return res -}