// ================================================================
// Finalizing functions for streaming statistics. These are intended for
// streaming (i.e. single-pass) applications; with all the data in hand, the
// formulas would look different (and be more intuitive).
// ================================================================

package lib

import (
	"math"
)
// ----------------------------------------------------------------
// Univariate linear regression
// ----------------------------------------------------------------
// There are N (xi, yi) pairs.
//
// minimize E = sum (yi - m xi - b)^2
//
// Set the two partial derivatives to zero and solve for m and b:
//
//   DE/Dm = sum 2 (yi - m xi - b) (-xi) = 0
//   DE/Db = sum 2 (yi - m xi - b) (-1)  = 0
//
//   sum (yi - m xi - b) (xi) = 0
//   sum (yi - m xi - b)      = 0
//
//   sum (xi yi - m xi^2 - b xi) = 0
//   sum (yi - m xi - b)         = 0
//
//   m sum(xi^2) + b sum(xi) = sum(xi yi)
//   m sum(xi)   + b N       = sum(yi)
//
//   [ sum(xi^2)  sum(xi) ] [ m ]   [ sum(xi yi) ]
//   [ sum(xi)    N       ] [ b ] = [ sum(yi)    ]
//
//   [ m ]   [ sum(xi^2)  sum(xi) ]^-1 [ sum(xi yi) ]
//   [ b ] = [ sum(xi)    N       ]    [ sum(yi)    ]
//
//           [  N         -sum(xi)   ] [ sum(xi yi) ]
//         = [ -sum(xi)    sum(xi^2) ] [ sum(yi)    ] * 1/D
//
// where
//
//   D = N sum(xi^2) - sum(xi)^2.
//
// So
//
//       N sum(xi yi) - sum(xi) sum(yi)
//   m = ------------------------------
//                     D
//
//       -sum(xi) sum(xi yi) + sum(xi^2) sum(yi)
//   b = ---------------------------------------
//                          D
//
// ----------------------------------------------------------------
// GetLinearRegressionOLS is the finalizing function for computing the
// ordinary-least-squares fit from streamed accumulator values.
func GetLinearRegressionOLS(
	nint int64,
	sumx float64,
	sumx2 float64,
	sumxy float64,
	sumy float64,
) (m, b float64) {
	n := float64(nint)
	D := n*sumx2 - sumx*sumx
	m = (n*sumxy - sumx*sumy) / D
	b = (-sumx*sumxy + sumx2*sumy) / D
	return m, b
}
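
// A minimal usage sketch (illustrative values; not part of the original
// file): accumulate the sums in a single pass, then finalize.
//
//	var nint int64
//	var sumx, sumx2, sumxy, sumy float64
//	xs := []float64{1, 2, 3}
//	ys := []float64{2, 4, 6}
//	for i := range xs {
//		nint++
//		sumx += xs[i]
//		sumx2 += xs[i] * xs[i]
//		sumxy += xs[i] * ys[i]
//		sumy += ys[i]
//	}
//	m, b := GetLinearRegressionOLS(nint, sumx, sumx2, sumxy, sumy) // m == 2, b == 0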
// We would need a second pass through the data to compute the error bars,
// given the data and the fitted m and b:
//
//   # Young 1962, pp. 122-124. Compute sample variance of linear
//   # approximations, then variances of m and b.
//   var_z = 0.0
//   for i in range(0, N):
//       var_z += (m * xs[i] + b - ys[i])**2
//   var_z /= N
//
//   var_m = (N * var_z) / D
//   var_b = (var_z * sumx2) / D
//
//   output = [m, b, math.sqrt(var_m), math.sqrt(var_b)]
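
// A minimal Go sketch of that second pass, transliterated from the pseudocode
// above. This helper is illustrative only -- its name and signature are
// assumptions, not part of this package's exported API.
func getLinearRegressionOLSErrorBars(
	xs []float64, // the original data, retained for the second pass
	ys []float64,
	m float64, // slope from GetLinearRegressionOLS
	b float64, // intercept from GetLinearRegressionOLS
	sumx2 float64,
	D float64, // D = n*sumx2 - sumx*sumx, as in GetLinearRegressionOLS
) (sigmaM, sigmaB float64) {
	n := float64(len(xs))

	// Sample variance of the linear approximations (Young 1962, pp. 122-124).
	varZ := 0.0
	for i := range xs {
		e := m*xs[i] + b - ys[i]
		varZ += e * e
	}
	varZ /= n

	// Variances of m and b; the error bars are their square roots.
	varM := (n * varZ) / D
	varB := (varZ * sumx2) / D
	return math.Sqrt(varM), math.Sqrt(varB)
}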
// ----------------------------------------------------------------
// GetVar is the finalizing function for computing the sample variance
// (denominator n-1) from streamed accumulator values.
func GetVar(
	nint int64,
	sumx float64,
	sumx2 float64,
) float64 {
	n := float64(nint)
	mean := sumx / n
	numerator := sumx2 - mean*(2.0*sumx-n*mean)
	if numerator < 0.0 { // round-off error
		numerator = 0.0
	}
	denominator := n - 1.0
	return numerator / denominator
}
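
// A quick worked check (illustrative): for the data {1, 2, 3, 4} we have
// n = 4, sumx = 10, sumx2 = 30, mean = 2.5, so the numerator is
// 30 - 2.5*(20 - 10) = 5 and the sample variance is 5/3:
//
//	v := GetVar(4, 10.0, 30.0) // v == 5.0/3.0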
// ----------------------------------------------------------------
// Unbiased estimator:
//
//        (1/n) sum{(xi-mean)**3}
//   ------------------------------------
//   [(1/(n-1)) sum{(xi-mean)**2}]**1.5
//
// where mean = sumx / n, i.e. n mean = sumx.
//
//   sum{(xi-mean)^3}
//     = sum{xi^3 - 3 mean xi^2 + 3 mean^2 xi - mean^3}
//     = sum{xi^3} - 3 mean sum{xi^2} + 3 mean^2 sum{xi} - n mean^3
//     = sumx3 - 3 mean sumx2 + 3 mean^2 sumx - n mean^3
//     = sumx3 - 3 mean sumx2 + 3n mean^3 - n mean^3
//     = sumx3 - 3 mean sumx2 + 2n mean^3
//     = sumx3 - mean*(3 sumx2 - 2n mean^2)
//
//   sum{(xi-mean)^2}
//     = sum{xi^2 - 2 mean xi + mean^2}
//     = sum{xi^2} - 2 mean sum{xi} + n mean^2
//     = sumx2 - 2 mean sumx + n mean^2
//     = sumx2 - 2n mean^2 + n mean^2
//     = sumx2 - n mean^2
// ----------------------------------------------------------------
// GetSkewness is the finalizing function for computing skewness from streamed
// accumulator values.
func GetSkewness(
	nint int,
	sumx float64,
	sumx2 float64,
	sumx3 float64,
) float64 {
	n := float64(nint)
	mean := sumx / n
	numerator := sumx3 - mean*(3*sumx2-2*n*mean*mean)
	numerator = numerator / n
	denominator := (sumx2 - n*mean*mean) / (n - 1)
	denominator = math.Pow(denominator, 1.5)
	return numerator / denominator
}
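
// A quick sanity check (illustrative): symmetric data should have zero
// skewness. For {1, 2, 3}: sumx = 6, sumx2 = 14, sumx3 = 36, mean = 2, and
// the numerator is 36 - 2*(3*14 - 2*3*4) = 0:
//
//	s := GetSkewness(3, 6.0, 14.0, 36.0) // s == 0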
// ----------------------------------------------------------------
// Unbiased:
//
//      (1/n) sum{(x-mean)**4}
//   ------------------------------ - 3
//   [(1/n) sum{(x-mean)**2}]**2
//
//   sum{(xi-mean)^4}
//     = sum{xi^4 - 4 mean xi^3 + 6 mean^2 xi^2 - 4 mean^3 xi + mean^4}
//     = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 4 mean^3 sum{xi} + n mean^4
//     = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 4 n mean^4 + n mean^4
//     = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 3 n mean^4
//     = sum{xi^4} - mean*(4 sum{xi^3} - 6 mean sum{xi^2} + 3 n mean^3)
//     = sumx4 - mean*(4 sumx3 - 6 mean sumx2 + 3 n mean^3)
//     = sumx4 - mean*(4 sumx3 - mean*(6 sumx2 - 3 n mean^2))
// ----------------------------------------------------------------
// GetKurtosis is the finalizing function for computing excess kurtosis from
// streamed accumulator values.
func GetKurtosis(
	nint int,
	sumx float64,
	sumx2 float64,
	sumx3 float64,
	sumx4 float64,
) float64 {
	n := float64(nint)
	mean := sumx / n
	numerator := sumx4 - mean*(4*sumx3-mean*(6*sumx2-3*n*mean*mean))
	numerator = numerator / n
	denominator := (sumx2 - n*mean*mean) / n
	denominator = denominator * denominator
	return numerator/denominator - 3.0
}
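
// A quick worked check (illustrative): for {1, 2, 3}: sumx = 6, sumx2 = 14,
// sumx3 = 36, sumx4 = 98, giving (2/3)/(2/3)^2 - 3 = -1.5, the excess
// kurtosis of this three-point distribution:
//
//	k := GetKurtosis(3, 6.0, 14.0, 36.0, 98.0) // k == -1.5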
// ----------------------------------------------------------------
// Non-streaming implementation:
//
//   def find_sample_covariance(xs, ys):
//       n = len(xs)
//       mean_x = find_mean(xs)
//       mean_y = find_mean(ys)
//
//       sum = 0.0
//       for k in range(0, n):
//           sum += (xs[k] - mean_x) * (ys[k] - mean_y)
//
//       return sum / (n-1.0)
//
// GetCov is the finalizing function for computing the sample covariance from
// streamed accumulator values.
func GetCov(
	nint int64,
	sumx float64,
	sumy float64,
	sumxy float64,
) float64 {
	n := float64(nint)
	meanx := sumx / n
	meany := sumy / n
	numerator := sumxy - meanx*sumy - meany*sumx + n*meanx*meany
	denominator := n - 1
	return numerator / denominator
}
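
// A quick worked check (illustrative): for xs = {1, 2, 3} and ys = {2, 4, 6}
// (so y = 2x exactly), sumx = 6, sumy = 12, sumxy = 28, and the sample
// covariance is twice the sample variance of the x's:
//
//	c := GetCov(3, 6.0, 12.0, 28.0) // c == 2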
// ----------------------------------------------------------------
// GetCovMatrix is the finalizing function for computing the 2x2 sample
// covariance matrix from streamed accumulator values.
func GetCovMatrix(
	nint int64,
	sumx float64,
	sumx2 float64,
	sumy float64,
	sumy2 float64,
	sumxy float64,
) (Q [2][2]float64) {
	n := float64(nint)
	denominator := n - 1
	Q[0][0] = (sumx2 - sumx*sumx/n) / denominator
	Q[0][1] = (sumxy - sumx*sumy/n) / denominator
	Q[1][0] = Q[0][1]
	Q[1][1] = (sumy2 - sumy*sumy/n) / denominator
	return Q
}
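
// Note how the entries relate to the scalar finalizers above: the diagonal
// agrees with GetVar and the off-diagonal with GetCov. Continuing the
// illustrative xs = {1, 2, 3}, ys = {2, 4, 6} example (sumy2 = 56):
//
//	Q := GetCovMatrix(3, 6.0, 14.0, 12.0, 56.0, 28.0)
//	// Q[0][0] == GetVar(3, 6.0, 14.0)       == 1
//	// Q[0][1] == GetCov(3, 6.0, 12.0, 28.0) == 2
//	// Q[1][1] == GetVar(3, 12.0, 56.0)      == 4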
// ----------------------------------------------------------------
// Principal component analysis can be used for linear regression:
//
// * Compute the covariance matrix of the x's and y's.
//
// * Find the eigenvalues and eigenvectors of that covariance matrix. (The
//   matrix is real-symmetric, so Jacobi iteration is simple and fine.)
//
// * The principal eigenvector points in the direction of the fit.
//
// * The covariance matrix is computed on zero-mean data, so the intercept
//   is zero. The fit equation is of the form (y - nu) = m*(x - mu), where mu
//   and nu are the x and y means, respectively.
//
// * If the fit is perfect then the 2nd eigenvalue will be zero; if the fit
//   is good then the 2nd eigenvalue will be much smaller than the 1st; if
//   the fit is bad then the two will be about the same. I use
//   1 - |lambda2|/|lambda1| as an indication of the quality of the fit.
//
// Standard ("ordinary least-squares") linear regression is appropriate when
// the errors are thought to be all in the y's; PCA ("total least-squares") is
// appropriate when the x's and the y's are both thought to have errors.
//
// GetLinearRegressionPCA turns the eigen-decomposition of that covariance
// matrix, along with the x and y means, into a slope, an intercept, and a
// fit-quality measure.
func GetLinearRegressionPCA(
	eigenvalue_1 float64,
	eigenvalue_2 float64,
	eigenvector_1 [2]float64,
	eigenvector_2 [2]float64,
	x_mean float64,
	y_mean float64,
) (m, b, quality float64) {
	abs_1 := math.Abs(eigenvalue_1)
	abs_2 := math.Abs(eigenvalue_2)
	quality = 1.0
	if abs_1 == 0.0 {
		quality = 0.0
	} else if abs_2 > 0.0 {
		quality = 1.0 - abs_2/abs_1
	}

	a0 := eigenvector_1[0]
	a1 := eigenvector_1[1]
	m = a1 / a0
	b = y_mean - m*x_mean
	return m, b, quality
}
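
// A quick worked check (illustrative): for the same perfectly linear data,
// GetCovMatrix gives [[1, 2], [2, 4]], whose eigenvalues are 5 and 0, with
// principal eigenvector proportional to (1, 2). With x_mean = 2 and
// y_mean = 4:
//
//	m, b, q := GetLinearRegressionPCA(5.0, 0.0, [2]float64{1, 2}, [2]float64{2, -1}, 2.0, 4.0)
//	// m == 2, b == 0, q == 1 (perfect fit)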