From 42bad5891a6538d434b7353f72a3eb7575692231 Mon Sep 17 00:00:00 2001 From: Nuno Cruces Date: Wed, 22 Jan 2025 12:09:20 +0000 Subject: [PATCH] Skewness and excess kurtosis. --- ext/stats/moments.go | 13 ++++- ext/stats/moments_test.go | 18 +++---- ext/stats/stats.go | 111 ++++++++++++++++++++++++++++++-------- ext/stats/stats_test.go | 19 ++++++- 4 files changed, 127 insertions(+), 34 deletions(-) diff --git a/ext/stats/moments.go b/ext/stats/moments.go index 70f6faca..09136196 100644 --- a/ext/stats/moments.go +++ b/ext/stats/moments.go @@ -2,6 +2,7 @@ package stats import "math" +// Fisher–Pearson skewness and kurtosis using // Terriberry's algorithm with Kahan summation: // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics @@ -45,6 +46,10 @@ func (m moments) skewness_samp() float64 { } func (m moments) kurtosis_pop() float64 { + return m.raw_kurtosis_pop() - 3 +} + +func (m moments) raw_kurtosis_pop() float64 { m2 := m.m2.hi if div := m2 * m2; div != 0 { return m.m4.hi * float64(m.n) / div @@ -53,9 +58,15 @@ func (m moments) kurtosis_pop() float64 { } func (m moments) kurtosis_samp() float64 { + n := m.n + k := math.FMA(m.raw_kurtosis_pop(), float64(n+1), float64(3-3*n)) + return k * float64(n-1) / float64((n-2)*(n-3)) +} + +func (m moments) raw_kurtosis_samp() float64 { n := m.n // https://mathworks.com/help/stats/kurtosis.html#f4975293 - k := math.FMA(m.kurtosis_pop(), float64(n+1), float64(3-3*n)) + k := math.FMA(m.raw_kurtosis_pop(), float64(n+1), float64(3-3*n)) return math.FMA(k, float64(n-1)/float64((n-2)*(n-3)), 3) } diff --git a/ext/stats/moments_test.go b/ext/stats/moments_test.go index 1c9707fa..814348ab 100644 --- a/ext/stats/moments_test.go +++ b/ext/stats/moments_test.go @@ -14,7 +14,7 @@ func Test_moments(t *testing.T) { if !math.IsNaN(s1.skewness_pop()) { t.Errorf("want NaN") } - if !math.IsNaN(s1.kurtosis_pop()) { + if !math.IsNaN(s1.raw_kurtosis_pop()) { t.Errorf("want NaN") } @@ -29,16 +29,16 @@ func Test_moments(t *testing.T) { s1.enqueue(+3.5784) s1.enqueue(+2.7694) - if got := float32(s1.skewness_pop()); got != 0.106098293 { + if got := s1.skewness_pop(); float32(got) != 0.106098293 { t.Errorf("got %v, want 0.1061", got) } - if got := float32(s1.skewness_samp()); got != 0.1258171 { + if got := s1.skewness_samp(); float32(got) != 0.1258171 { t.Errorf("got %v, want 0.1258", got) } - if got := float32(s1.kurtosis_pop()); got != 2.3121266 { + if got := s1.raw_kurtosis_pop(); float32(got) != 2.3121266 { t.Errorf("got %v, want 2.3121", got) } - if got := float32(s1.kurtosis_samp()); got != 2.7482237 { + if got := s1.raw_kurtosis_samp(); float32(got) != 2.7482237 { t.Errorf("got %v, want 2.7483", got) } @@ -72,16 +72,16 @@ func Test_moments(t *testing.T) { s1.dequeue(math.E) s1.dequeue(math.Sqrt2) - if got := float32(s1.skewness_pop()); got != 0.106098293 { + if got := s1.skewness_pop(); float32(got) != 0.106098293 { t.Errorf("got %v, want 0.1061", got) } - if got := float32(s1.skewness_samp()); got != 0.1258171 { + if got := s1.skewness_samp(); float32(got) != 0.1258171 { t.Errorf("got %v, want 0.1258", got) } - if got := float32(s1.kurtosis_pop()); got != 2.3121266 { + if got := s1.raw_kurtosis_pop(); float32(got) != 2.3121266 { t.Errorf("got %v, want 2.3121", got) } - if got := float32(s1.kurtosis_samp()); got != 2.7482237 { + if got := s1.raw_kurtosis_samp(); float32(got) != 2.7482237 { t.Errorf("got %v, want 2.7483", got) } } diff --git a/ext/stats/stats.go b/ext/stats/stats.go index 371a3906..b0c7ff31 100644 --- a/ext/stats/stats.go +++ b/ext/stats/stats.go @@ -1,13 +1,17 @@ // Package stats provides aggregate functions for statistics. // // Provided functions: -// - stddev_pop: population standard deviation -// - stddev_samp: sample standard deviation // - var_pop: population variance // - var_samp: sample variance +// - stddev_pop: population standard deviation +// - stddev_samp: sample standard deviation +// - skewness_pop: Pearson population skewness +// - skewness_samp: Pearson sample skewness +// - kurtosis_pop: Fisher population excess kurtosis +// - kurtosis_samp: Fisher sample excess kurtosis // - covar_pop: population covariance // - covar_samp: sample covariance -// - corr: correlation coefficient +// - corr: Pearson correlation coefficient // - regr_r2: correlation coefficient squared // - regr_avgx: average of the independent variable // - regr_avgy: average of the dependent variable @@ -61,6 +65,10 @@ func Register(db *sqlite3.Conn) error { db.CreateWindowFunction("var_samp", 1, flags, newVariance(var_samp)), db.CreateWindowFunction("stddev_pop", 1, flags, newVariance(stddev_pop)), db.CreateWindowFunction("stddev_samp", 1, flags, newVariance(stddev_samp)), + db.CreateWindowFunction("skewness_pop", 1, flags, newMoments(skewness_pop)), + db.CreateWindowFunction("skewness_samp", 1, flags, newMoments(skewness_samp)), + db.CreateWindowFunction("kurtosis_pop", 1, flags, newMoments(kurtosis_pop)), + db.CreateWindowFunction("kurtosis_samp", 1, flags, newMoments(kurtosis_samp)), db.CreateWindowFunction("covar_pop", 2, flags, newCovariance(var_pop)), db.CreateWindowFunction("covar_samp", 2, flags, newCovariance(var_samp)), db.CreateWindowFunction("corr", 2, flags, newCovariance(corr)), @@ -88,6 +96,10 @@ const ( var_samp stddev_pop stddev_samp + skewness_pop + skewness_samp + kurtosis_pop + kurtosis_samp corr regr_r2 regr_sxx @@ -101,6 +113,23 @@ const ( regr_json ) +func special(kind int, n int64) (null, zero bool) { + switch kind { + case var_pop, stddev_pop, regr_sxx, regr_syy, regr_sxy: + return n <= 0, n == 1 + case regr_avgx, regr_avgy: + return n <= 0, false + case kurtosis_samp: + return n <= 3, false + case skewness_samp: + return n <= 2, false + case skewness_pop: + return n <= 1, n == 2 + default: + return n <= 1, false + } +} + func newVariance(kind int) func() sqlite3.AggregateFunction { return func() sqlite3.AggregateFunction { return &variance{kind: kind} } } @@ -111,14 +140,11 @@ type variance struct { } func (fn *variance) Value(ctx sqlite3.Context) { - switch fn.n { - case 1: - switch fn.kind { - case var_pop, stddev_pop: - ctx.ResultFloat(0) - } + switch null, zero := special(fn.kind, fn.n); { + case zero: + ctx.ResultFloat(0) return - case 0: + case null: return } @@ -166,18 +192,11 @@ func (fn *covariance) Value(ctx sqlite3.Context) { ctx.ResultInt64(fn.regr_count()) return } - switch fn.n { - case 1: - switch fn.kind { - case var_pop, stddev_pop, regr_sxx, regr_syy, regr_sxy: - ctx.ResultFloat(0) - return - case regr_avgx, regr_avgy: - break - default: - return - } - case 0: + switch null, zero := special(fn.kind, fn.n); { + case zero: + ctx.ResultFloat(0) + return + case null: return } @@ -234,3 +253,51 @@ func (fn *covariance) Inverse(ctx sqlite3.Context, arg ...sqlite3.Value) { fn.dequeue(fa, fb) } } + +func newMoments(kind int) func() sqlite3.AggregateFunction { + return func() sqlite3.AggregateFunction { return &momentfn{kind: kind} } +} + +type momentfn struct { + kind int + moments +} + +func (fn *momentfn) Value(ctx sqlite3.Context) { + switch null, zero := special(fn.kind, fn.n); { + case zero: + ctx.ResultFloat(0) + return + case null: + return + } + + var r float64 + switch fn.kind { + case skewness_pop: + r = fn.skewness_pop() + case skewness_samp: + r = fn.skewness_samp() + case kurtosis_pop: + r = fn.kurtosis_pop() + case kurtosis_samp: + r = fn.kurtosis_samp() + } + ctx.ResultFloat(r) +} + +func (fn *momentfn) Step(ctx sqlite3.Context, arg ...sqlite3.Value) { + a := arg[0] + f := a.Float() + if f != 0.0 || a.NumericType() != sqlite3.NULL { + fn.enqueue(f) + } +} + +func (fn *momentfn) Inverse(ctx sqlite3.Context, arg ...sqlite3.Value) { + a := arg[0] + f := a.Float() + if f != 0.0 || a.NumericType() != sqlite3.NULL { + fn.dequeue(f) + } +} diff --git a/ext/stats/stats_test.go b/ext/stats/stats_test.go index fec5a7ce..4008ce46 100644 --- a/ext/stats/stats_test.go +++ b/ext/stats/stats_test.go @@ -49,7 +49,9 @@ func TestRegister_variance(t *testing.T) { SELECT sum(x), avg(x), var_samp(x), var_pop(x), - stddev_samp(x), stddev_pop(x) + stddev_samp(x), stddev_pop(x), + skewness_samp(x), skewness_pop(x), + kurtosis_samp(x), kurtosis_pop(x) FROM data`) if err != nil { t.Fatal(err) @@ -73,13 +75,26 @@ func TestRegister_variance(t *testing.T) { if got := stmt.ColumnFloat(5); got != math.Sqrt(22.5) { t.Errorf("got %v, want √22.5", got) } + if got := stmt.ColumnFloat(6); got != 0 { + t.Errorf("got %v, want zero", got) + } + if got := stmt.ColumnFloat(7); got != 0 { + t.Errorf("got %v, want zero", got) + } + if got := stmt.ColumnFloat(8); float32(got) != -3.3 { + t.Errorf("got %v, want -3.3", got) + } + if got := stmt.ColumnFloat(9); got != -1.64 { + t.Errorf("got %v, want -1.64", got) + } } stmt.Close() stmt, _, err = db.Prepare(` SELECT var_samp(x) OVER (ROWS 1 PRECEDING), - var_pop(x) OVER (ROWS 1 PRECEDING) + var_pop(x) OVER (ROWS 1 PRECEDING), + skewness_pop(x) OVER (ROWS 1 PRECEDING) FROM data`) if err != nil { t.Fatal(err)