"""
very basic statistics functions
"""
__author__ = "Philippe Guglielmetti"
__copyright__ = "Copyright 2012, Philippe Guglielmetti"
__credits__ = []
__license__ = "LGPL"
import itertools
import logging
import math

import matplotlib
from Goulib import plot # sets matplotlib backend
import matplotlib.pyplot as plt # after import .plot
from Goulib import itertools2, math2, expr
def mean_var(data):
    """single-pass mean and variance using Welford's stable algorithm

    :param data: iterable of numbers
    :return: (mean, variance) floats; variance is NaN for fewer than 2 values
    """
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    count, mu, m2 = 0, 0, 0
    for value in data:
        count += 1
        diff = value - mu
        mu += diff / count
        m2 += diff * (value - mu)  # uses updated mu : stable update of 2nd moment
    if count < 2:
        # variance undefined for 0 or 1 sample
        return mu, float('nan')
    return mu, m2 / (count - 1)  # sample variance (n-1 denominator)
def mean(data):
    """arithmetic mean

    :param data: iterable of numbers
    :return: float mean of data
    """
    m, _ = mean_var(data)
    return m


avg = mean  # alias
def variance(data):
    """sample variance (n-1 denominator)

    :param data: iterable of numbers
    :return: float variance of data, NaN for fewer than 2 values
    """
    _, v = mean_var(data)
    return v


var = variance  # alias
def stddev(data):
    """sample standard deviation

    :param data: iterable of numbers
    :return: float standard deviation of data
    """
    v = variance(data)
    return math.sqrt(v)
def confidence_interval(data, conf=0.95):
    """normal-approximation confidence interval for the mean of data

    :param data: sequence of numbers (len() is used)
    :param conf: float two-sided confidence level in ]0,1[
    :return: (low, high) bounds of the conf confidence interval
    """
    from statistics import NormalDist  # stdlib, Python 3.8+
    m, v = mean_var(data)
    # z-score for the two-sided interval.
    # fix: was hardcoded to 1.96, silently ignoring any conf other than 0.95
    z = NormalDist().inv_cdf((1 + conf) / 2)
    e = z * math.sqrt(v) / math.sqrt(len(data))
    return m - e, m + e
def mode(data, is_sorted=False):
    """most frequent value of data

    :param data: iterable of comparable values
    :param is_sorted: bool, pass True if data is already sorted
    :return: mode (most frequent value); ties resolve to the first value
        in sorted order; None for empty data
    """
    x = data if is_sorted else sorted(data)
    res, count = None, 0
    # group runs of equal values (x is sorted, so equal values are adjacent).
    # fix: the previous implementation appended/popped a None sentinel on x,
    # temporarily mutating the caller's list when is_sorted=True
    for value, group in itertools.groupby(x):
        c = sum(1 for _ in group)
        if c > count:  # strict > : keeps the earliest value on ties
            res, count = value, c
    return res
def kurtosis(data):
    """excess kurtosis by a single-pass stable algorithm

    :param data: iterable of numbers
    :return: float excess kurtosis (0 for a normal distribution);
        NaN for empty or constant data (previously raised ZeroDivisionError)
    """
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    n = 0
    mean = 0
    M2 = 0  # n * 2nd central moment
    M3 = 0  # n * 3rd central moment
    M4 = 0  # n * 4th central moment
    for x in data:
        n1 = n
        n = n + 1
        delta = x - mean
        delta_n = delta / n
        delta_n2 = delta_n * delta_n
        term1 = delta * delta_n * n1
        mean = mean + delta_n
        # higher moments must be updated before the lower ones they depend on
        M4 = M4 + term1 * delta_n2 * \
            (n * n - 3 * n + 3) + 6 * delta_n2 * M2 - 4 * delta_n * M3
        M3 = M3 + term1 * delta_n * (n - 2) - 3 * delta_n * M2
        M2 = M2 + term1
    if M2 == 0:  # empty or constant data : kurtosis undefined
        return float('nan')
    kurtosis = (n * M4) / (M2 * M2) - 3
    return kurtosis
def covariance(data1, data2):
    """sample covariance of two equal-length sequences, computed in one
    numerically stable pass

    :param data1: indexable sequence of numbers
    :param data2: indexable sequence of numbers, same length as data1
    :return: float sample covariance (n-1 denominator)
    """
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Covariance
    mean1 = mean2 = 0
    comoment = 0  # running co-moment, rescaled each step
    n = len(data1)
    for i, x1 in enumerate(data1):
        k = i + 1  # number of samples seen so far
        delta1 = (x1 - mean1) / k
        mean1 += delta1
        delta2 = (data2[i] - mean2) / k
        mean2 += delta2
        comoment += i * delta1 * delta2 - comoment / k
    return n / (n - 1.) * comoment
def stats(l):
    """single-pass summary statistics of an iterable

    :param l: iterable of numbers
    :return: tuple (min, max, sum, sum of squares, mean, variance)
    """
    acc = Stats(l)
    return acc.lo, acc.hi, acc.sum1, acc.sum2, acc.avg, acc.var
class Stats(object):
    """an object that computes mean, variance and modes of data that is appended to it
    as in a list (but actual values are not stored)
    """

    def __init__(self, data=[], mean=None, var=None):
        """
        :param data: optional iterable of initial values
        :param mean: float, used together with var when data is empty
        :param var: float, used together with mean when data is empty
        """
        # NOTE(review): mutable default argument; harmless here because data
        # is only iterated or rebound, never mutated
        self.lo = float("inf")    # minimum value seen so far
        self.hi = float("-inf")   # maximum value seen so far
        self.n = 0                # number of values
        # sums are accumulated relative to _offset (the first value) for
        # numerical stability (shifted-data algorithm)
        self._offset = 0
        self._dsum1 = 0           # sum of (x - _offset)
        self._dsum2 = 0           # sum of (x - _offset)**2
        if not data:
            # no data : synthesize two points that have exactly the
            # requested mean and sample variance
            s2 = math.sqrt(var / 2)
            data = [mean - s2, mean + s2]
        self.extend(data)

    def __repr__(self):
        return "{}(mean={:.12g}, var={:.12g})".format(self.__class__.__name__, self.mu, self.var)

    def append(self, x):
        """add data x to Stats"""
        if (self.n == 0):
            # first value becomes the offset, so its delta below is 0
            self._offset = x
        self.n += 1
        delta = x - self._offset
        self._dsum1 += delta
        self._dsum2 += delta * delta
        if x < self.lo:
            self.lo = x
        if x > self.hi:
            self.hi = x

    def extend(self, data):
        """append each value of data"""
        for x in data:
            self.append(x)

    def remove(self, data):
        """remove data from Stats
        :param data: value or iterable of values
        """
        if not hasattr(data, '__iter__'):
            data = [data]
        for x in data:
            self.n -= 1
            delta = x - self._offset
            self._dsum1 -= delta
            self._dsum2 -= delta * delta
            # lo/hi cannot be recomputed since values are not stored
            if x <= self.lo:
                logging.warning('lo value possibly invalid')
            if x >= self.hi:
                logging.warning('hi value possibly invalid')

    @property
    def sum(self):
        # undo the offset shift : sum(x) = n*offset + sum(x - offset)
        return self._offset * self.n + self._dsum1

    sum1 = sum  # alias

    @property
    def sum2(self):
        # sum of squares recovered from the shifted sums
        return self._dsum2 + self._offset * (2 * self.sum - self.n * self._offset)

    @property
    def mean(self):
        return self._offset + self._dsum1 / self.n

    avg = mean  # alias
    average = mean  # alias
    mu = mean  # alias

    @property
    def variance(self):
        # sample variance (n-1 denominator) from the shifted sums
        if self.n < 2:  # variance of a single data...
            return 0
        return (self._dsum2 - (self._dsum1 * self._dsum1) / self.n) / (self.n - 1)

    var = variance  # alias

    @property
    def stddev(self):
        return math.sqrt(self.variance)

    sigma = stddev

    def __add__(self, other):
        """:return: a new Stats approximating the distribution of self + other"""
        if math2.is_number(other):
            other = Stats([other])
        # https://fr.wikipedia.org/wiki/Variance_(statistiques_et_probabilit%C3%A9s)#Produit
        try:
            # NOTE(review): the module-level covariance() indexes its
            # arguments, but Stats is not indexable, so this presumably
            # always raises and cov falls back to 0 — confirm
            cov = covariance(self, other)
        except:
            cov = 0  # consider the variables decorrelated
        mean = (self.mean + other.mean) / 2  # TODO : improve
        var = self.variance + other.variance + 2 * cov
        return Stats(mean=mean, var=var)

    def __sub__(self, other):
        return self + (-other)

    def __mul__(self, other):
        if math2.is_number(other):
            # NOTE(review): for a scalar a, E[aX]=a*E[X] and Var(aX)=a**2*Var(X);
            # this keeps mean unchanged and scales var by a — confirm intent
            mean = self.mean
            var = other * self.variance
        else:  # it's a Stat
            mean = self.mean * other.mean
            # https://fr.wikipedia.org/wiki/Variance_(statistiques_et_probabilit%C3%A9s)#Produit
            var = self.variance * other.variance + \
                self.variance * other.mean ** 2 + other.variance * self.mean ** 2
        return Stats(mean=mean, var=var)

    def __neg__(self):
        return self * (-1)

    def __pow__(self, n):
        # integer power by repeated multiplication; n <= 1 returns a copy
        from copy import copy
        res = copy(self)
        while n > 1:
            res = res * self
            n -= 1
        return res

    def covariance(self, other):
        # built from the arithmetic operators above, which return new Stats
        xy = (self - self.mean) * (other - other.mean)
        return xy.mean
class Discrete(Stats):
    """discrete probability density function"""

    def __init__(self, data):
        """
        :param data: can be:

        * list of equiprobable values (uniform distribution)
        * dict of x:p values:probability pairs
        """
        size = len(data)
        if not isinstance(data, dict):  # uniform distribution over the values
            data = {value: 1 / size for value in list(data)}
        # feed Stats with p-weighted values rescaled by size so the
        # accumulated mean equals sum(x * p)
        Stats.__init__(self, [value * data[value] * size for value in data])
        self.pdf = data

    def __call__(self, x):
        if itertools2.isiterable(x):
            return (self(value) for value in x)
        return self.pdf.get(x, 0)
class PDF(expr.Expr, Stats):
    """probability density function"""

    def __init__(self, pdf, data=[]):
        """
        :param pdf: callable probability density function
        :param data: optional iterable of values fed to Stats
        """
        # init Stats first, then Expr, both on purpose in this order
        Stats.__init__(self, data)
        self.pdf = pdf
        expr.Expr.__init__(self, pdf)

    def __call__(self, x=None, **kwargs):
        if itertools2.isiterable(x):
            return (self(value) for value in x)
        return self.pdf(x)
def normal_pdf(x, mu, sigma):
    """normal (Gaussian) probability density function

    :param x: float value at which to evaluate the density
    :param mu: float mean of the distribution
    :param sigma: float standard deviation
    :return: float density at x; when sigma is 0 the distribution
        degenerates to an indicator of x == mu
    """
    try:
        return 1. / (math.sqrt(2 * math.pi) * sigma) * math.exp(-0.5 * (1. / sigma * (x - mu)) ** 2)
    except ZeroDivisionError:
        # sigma == 0 : Dirac-like degenerate distribution.
        # fix: this branch compared x against the module-level function
        # `mean` instead of the parameter, so it always returned 0
        return 1 if math.isclose(x, mu) else 0
expr.default_context.add_function(normal_pdf) # add to allowed functions
class Normal(PDF):
    """represents a normal distributed variable
    the base class (list) optionally contains data
    """

    def __init__(self, data=[], mean=0, var=1):
        """if data is specified, it is used to fit a normal law

        :param data: optional iterable of values
        :param mean: float mean, used when data is empty
        :param var: float variance, used when data is empty
        """
        sigma = math.sqrt(var)
        s2 = math.sqrt(var / 2)
        # this way we preserve mean and variance
        data = data or [mean - s2, mean + s2]
        super(Normal, self).__init__(
            lambda x: normal_pdf(x, mean, sigma), data)

    def __str__(self):
        return Stats.__repr__(self)

    def latex(self):
        """:return: string LaTeX representation N(mu=..., sigma=...)"""
        mean = expr.Expr(self.mean).latex()
        sigma = expr.Expr(self.sigma).latex()
        return r"\mathcal{N}(\mu=%s, \sigma=%s)" % (mean, sigma)

    def _plot(self, ax, x=None, **kwargs):
        # default plot range : mean +/- 3 sigma covers ~99.7% of the density
        if x is None:
            x = itertools2.linspace(
                self.mu - 3 * self.sigma, self.mu + 3 * self.sigma, 101)
        x = list(x)
        y = list(self(x))
        return expr.Expr._plot(self, ax, x, y, **kwargs)

    def linear(self, a, b=0):
        """
        :return: a*self+b
        """
        # NOTE(review): Var(aX+b) = a**2*Var(X); this uses abs(a)*Var(X) —
        # confirm intent
        return Normal(mean=self.mean * a + b, var=abs(self.var * a))

    def __mul__(self, a):
        return self.linear(a, 0)

    def __div__(self, a):
        # Python 2 division operator, kept for backward compatibility
        return self.linear(1. / a, 0)

    __truediv__ = __div__

    def __add__(self, other):
        if isinstance(other, (int, float)):
            return self.linear(1, other)
        # else: assume other is a Normal variable
        mean = self.mean + other.mean
        var = self.var + other.var + 2 * self.cov(other)
        return Normal(mean=mean, var=var)

    def __radd__(self, other):
        return self + other

    def __neg__(self):
        return self * (-1)

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        return -(self - other)

    def covariance(self, other):
        """covariance with another variable; 0 if it cannot be computed"""
        try:
            # NOTE(review): math2.vecsub is applied to self/other directly;
            # this presumably requires them to be iterable — confirm
            return mean(
                math2.vecmul(
                    math2.vecsub(self, [], self.mean),
                    math2.vecsub(other, [], other.mean)
                )
            )
        except:
            return 0  # consider decorrelated

    cov = covariance  # alias

    def pearson(self, other):
        """:return: float Pearson correlation coefficient with other"""
        return self.cov(other) / (self.stddev * other.stddev)

    correlation = pearson  # alias
    corr = pearson  # alias
def linear_regression(x, y, conf=None):
    """simple linear regression y = b1*x + b0

    :param x: iterable of explanatory data
    :param y: iterable of dependent data, same length
    :param conf: float confidence level [0..1]. If None, confidence intervals are not returned
    :return: b1 (slope), b0 (intercept), s2 (residual variance) and,
        if conf is given, the (low, high) confidence intervals
        of b1, b0 and s2 respectively

    ex:

    >>> linear_regression([.1,.2,.3],[10,11,11.5],0.95)
    """
    # https://gist.github.com/riccardoscalco/5356167
    try:
        import scipy.stats
        import numpy  # TODO remove these requirements
    except ImportError:  # was a bare except, which also hid unrelated errors
        logging.error('scipy needed')
        return None
    x = numpy.array(x)
    y = numpy.array(y)
    n = len(x)
    xy = x * y
    xx = x * x
    # least-squares estimates
    b1 = (xy.mean() - x.mean() * y.mean()) / (xx.mean() - x.mean() ** 2)
    b0 = y.mean() - b1 * x.mean()
    # residual variance (biased, 1/n), vectorized instead of a Python loop
    s2 = ((y - b0 - b1 * x) ** 2).sum() / n
    if not conf:
        return b1, b0, s2
    # confidence intervals from chi2 (for s2) and Student t (for b0, b1)
    alpha = 1 - conf
    c1 = scipy.stats.chi2.ppf(alpha / 2., n - 2)
    c2 = scipy.stats.chi2.ppf(1 - alpha / 2., n - 2)
    c = -1 * scipy.stats.t.ppf(alpha / 2., n - 2)
    bb1 = c * (s2 / ((n - 2) * (xx.mean() - (x.mean()) ** 2))) ** .5
    bb0 = c * ((s2 / (n - 2)) * (1 + (x.mean()) ** 2 /
                                 (xx.mean() - (x.mean()) ** 2))) ** .5
    return b1, b0, s2, (b1 - bb1, b1 + bb1), (b0 - bb0, b0 + bb0), (n * s2 / c2, n * s2 / c1)