|
| 1 | +// Copyright 2025 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +// Package stats provides basic descriptive statistics. |
| 6 | +// |
| 7 | +// This is intended not as a comprehensive statistics package, but |
| 8 | +// to provide common, everyday statistical functions. |
| 9 | +// |
| 10 | +// As a rule of thumb, a statistical function belongs in this package |
| 11 | +// if it would be explained in a typical high school. |
| 12 | +// |
| 13 | +// These functions aim to balance performance and accuracy, but some |
| 14 | +// amount of error is inevitable in floating-point computations. |
| 15 | +// The underlying implementations may change, resulting in small |
| 16 | +// changes in their results from version to version. If the caller |
| 17 | +// needs particular guarantees on accuracy and overflow behavior or |
| 18 | +// version stability, they should use a more specialized |
| 19 | +// implementation. |
| 20 | +package stats |
| 21 | + |
| 22 | +// References: |
| 23 | +// |
| 24 | +// Hyndman, Rob J.; Fan, Yanan (November 1996). |
| 25 | +// "Sample Quantiles in Statistical Packages". |
| 26 | +// American Statistician. 50 (4). |
| 27 | +// American Statistical Association: 361–365. |
| 28 | +// doi:10.2307/2684934. JSTOR 2684934. |
| 29 | + |
| 30 | +import ( |
| 31 | + "math" |
| 32 | + "slices" |
| 33 | +) |
| 34 | + |
| 35 | +// Mean returns the arithmetic mean of the values in values. |
| 36 | +// |
| 37 | +// Mean does not modify the array. |
| 38 | +// |
| 39 | +// Mean panics if values is an empty slice. |
| 40 | +// |
| 41 | +// If values contains NaN or both Inf and -Inf, it returns NaN. |
| 42 | +// If values contains Inf, it returns Inf. If values contains -Inf, it returns -Inf. |
| 43 | +func Mean(values []float64) float64 { |
| 44 | + mean, infs := meanInf(values) |
| 45 | + switch infs { |
| 46 | + case negInf: |
| 47 | + return math.Inf(-1) |
| 48 | + case posInf: |
| 49 | + return math.Inf(1) |
| 50 | + case negInf | posInf: |
| 51 | + return math.NaN() |
| 52 | + default: // passthrough mean or NaN |
| 53 | + } |
| 54 | + return mean |
| 55 | +} |
| 56 | + |
| 57 | +// MeanAndStdDev returns the arithmetic mean and |
| 58 | +// sample standard deviation of values; the standard |
| 59 | +// deviation is only defined for len(values) > 1. |
| 60 | +// |
| 61 | +// MeanAndStdDev does not modify the array. |
| 62 | +// |
| 63 | +// MeanAndStdDev panics if values is an empty slice. |
| 64 | +// |
| 65 | +// If values contains NaN, it returns NaN, NaN. |
| 66 | +// If values contains both Inf and -Inf, it returns NaN, Inf. |
| 67 | +// If values contains Inf, it returns Inf, Inf. |
| 68 | +// If values contains -Inf, it returns -Inf, Inf. |
| 69 | +func MeanAndStdDev(values []float64) (float64, float64) { |
| 70 | + mean, infs := meanInf(values) |
| 71 | + switch infs { |
| 72 | + case 0: |
| 73 | + if math.IsNaN(mean) { |
| 74 | + return mean, math.NaN() |
| 75 | + } |
| 76 | + case negInf, posInf, negInf | posInf: |
| 77 | + return mean, math.Inf(1) |
| 78 | + } |
| 79 | + if len(values) == 1 { |
| 80 | + return mean, 0 |
| 81 | + } |
| 82 | + squaredDiffs := 0.0 |
| 83 | + for _, v := range values { |
| 84 | + diff := v - mean |
| 85 | + squaredDiffs += diff * diff |
| 86 | + } |
| 87 | + return mean, math.Sqrt(squaredDiffs / float64(len(values)-1)) |
| 88 | +} |
| 89 | + |
| 90 | +// meanInf calculates a naive mean value |
| 91 | +// and reports the infinities status. |
| 92 | +func meanInf(values []float64) (float64, infinities) { |
| 93 | + if len(values) == 0 { |
| 94 | + panic("mean: empty slice") |
| 95 | + } |
| 96 | + sum, infs := 0.0, infinities(0) |
| 97 | + for _, v := range values { |
| 98 | + switch { |
| 99 | + case math.IsInf(v, 1): |
| 100 | + infs |= posInf |
| 101 | + case math.IsInf(v, -1): |
| 102 | + infs |= negInf |
| 103 | + } |
| 104 | + sum += v |
| 105 | + } |
| 106 | + return sum / float64(len(values)), infs |
| 107 | +} |
| 108 | + |
| 109 | +// infinities is a bitset that records the presence of ±Inf in the input |
| 110 | +type infinities uint8 |
| 111 | + |
| 112 | +const ( |
| 113 | + negInf infinities = 1 << iota |
| 114 | + posInf |
| 115 | +) |
| 116 | + |
| 117 | +// Median returns the median of the values in values. |
| 118 | +// |
| 119 | +// Median does not modify the array. |
| 120 | +// |
| 121 | +// Median may perform asymptotically faster and allocate |
| 122 | +// asymptotically less if the slice is already sorted. |
| 123 | +// |
| 124 | +// If values is an empty slice, it panics. |
| 125 | +// If values contains NaN, it returns NaN. |
| 126 | +// -Inf is treated as smaller than all other values, |
| 127 | +// Inf is treated as larger than all other values, and |
| 128 | +// -0.0 is treated as smaller than 0.0. |
| 129 | +func Median(values []float64) float64 { return Quantiles(values, 0.5)[0] } |
| 130 | + |
| 131 | +// Quantiles returns a sequence of quantiles of values. |
| 132 | +// |
| 133 | +// The returned slice has the same length as the quantiles slice, |
| 134 | +// and the elements are one-to-one with the input quantiles. |
| 135 | +// A quantile of 0 corresponds to the minimum value in values and |
| 136 | +// a quantile of 1 corresponds to the maximum value in values. |
| 137 | +// A quantile of 0.5 is the same as the value returned by [Median]. |
| 138 | +// |
| 139 | +// Quantiles does not modify the array. |
| 140 | +// |
| 141 | +// Quantiles may perform asymptotically faster and allocate |
| 142 | +// asymptotically less if the slice is already sorted. |
| 143 | +// |
| 144 | +// Quantiles panics if values is an empty slice or any |
| 145 | +// quantile is not contained in the interval [0, 1]. |
| 146 | +// |
| 147 | +// If values contains NaN, it returns [NaN, ..., NaN]. |
| 148 | +// -Inf is treated as smaller than all other values, |
| 149 | +// Inf is treated as larger than all other values, and |
| 150 | +// -0.0 is treated as smaller than 0.0. |
| 151 | +func Quantiles(values []float64, quantiles ...float64) []float64 { |
| 152 | + if len(values) == 0 { |
| 153 | + panic("quantiles: empty slice") |
| 154 | + } |
| 155 | + if !slices.IsSorted(values) { |
| 156 | + values = slices.Clone(values) |
| 157 | + slices.Sort(values) |
| 158 | + } |
| 159 | + res := make([]float64, len(quantiles)) |
| 160 | + if math.IsNaN(values[0]) { |
| 161 | + for i := range res { |
| 162 | + res[i] = math.NaN() |
| 163 | + } |
| 164 | + return res |
| 165 | + } |
| 166 | + for i, q := range quantiles { |
| 167 | + if !(0 <= q && q <= 1) { |
| 168 | + panic("quantile must be contained in the interval [0, 1]") |
| 169 | + } |
| 170 | + // There are many methods for computing quantiles. Quantiles uses the |
| 171 | + // "inclusive" method, also known as Q7 in Hyndman and Fan, or the |
| 172 | + // "linear" or "R-7" method. This assumes that the data is either a |
| 173 | + // population or a sample that includes the most extreme values of the |
| 174 | + // underlying population. |
| 175 | + res[i] = hyndmanFanR7(values, q) |
| 176 | + } |
| 177 | + return res |
| 178 | +} |
| 179 | + |
| 180 | +// hyndmanFanR7 implements the Hyndman and Fan "R-7" |
| 181 | +// method of computing interpolated quantile values |
| 182 | +// over a sorted slice of vals. |
| 183 | +// |
| 184 | +// hyndmanFanR7 does not modify the array. |
| 185 | +func hyndmanFanR7(values []float64, q float64) float64 { |
| 186 | + h := float64(len(values)-1)*q + 1 |
| 187 | + // the h-th smallest of len(vals) values is at fn(h)-1. |
| 188 | + return values[floor(h-1)] + (h-math.Floor(h))*(values[ceil(h-1)]-values[floor(h-1)]) |
| 189 | +} |
| 190 | + |
| 191 | +// ceil returns the integer value of [math.Ceil]. |
| 192 | +func ceil(n float64) int { return int(math.Ceil(n)) } |
| 193 | + |
| 194 | +// floor returns the integer value of [math.Floor]. |
| 195 | +func floor(n float64) int { return int(math.Floor(n)) } |
0 commit comments