"""
file   descstats.py
author Ernesto P. Adorio
       UPDEPP (U.P. Clarkfield)
desc   basic stats
version 0.0.1  aug. 15, 2010
"""
from quantile import quantile
from math import sqrt,  exp,  log
def mean(X, npasses = 1):
    """
    arithmetic mean of vector X.
    Arguments:
        npasses : 1- no correction term.
                        2- with correction term.
    """
    n = float(len(X))
    avg = sum(X)/n
    if npasses > 1:
      return avg + sum([(x - avg) for x in X])/ n
    return avg
    
def rms(X):
    """
    root mean square average of vector X.
    """
    return sqrt(sum([x*x for x in X])/len(X))
def hm(X):
    """
    harmonic mean of vector X
    """
    if min(X)<= 0.0:
        return None
    return len(X) / sum([1.0/x for x in X])
def gm(X):
    """
    geometric mean of X
    """
    if min(X) <= 0.0:
        return None
    return exp(sum([log(x) for x in X])/len(X))
def pvar(X , npasses = 1):
    """
    Population variance of X, divisor is n.
    """
    sxsqr = sum([x*x for x in X])
    xbar  = mean(X, npasses)
    n = len(X)
    if npasses == 2:
      return sum([(x - xbar)**2 for x in X])/n
    else:
      return sxsqr/n - (xbar)**2
def svar(X, npasses = 1):
    """
    Sample variance of X, divisor is n-1.
    """
    n = len(X)
    return pvar(X, npasses) * n / (n-1.0)
def pstd(X, npasses = 1):
    """
    Population standard deviation.
    """
    return sqrt(pvar(X, npasses))
def sstd(X, npasses = 1):
    """
    Sample standard deviation.
    """
    return sqrt((svar(X, npasses)))
avg = mean
am = mean
pstddev = pstd
psdev = pstd
ssdev = sstd
sstddev = sstd
def median(X):
    """
    Median of  X
    """
    return quantile(X, 0.5)
def mad(X,  xbar = None):
    """
    Mean absolute deviation of X from mean.
    """
    if xbar is None: xbar = mean(X)
    return float(sum([abs(x-xbar) for x in X]))/len(X)
                    
madmean = mad
def madmed(X,  xmed = None):
    """
    Mean absolute deviation of X from median.
    """
    if xmed is None: xmed = median(X)
    return float(sum([abs(x- xmed) for x in X]))/len(X)
                    
def Q1(X):
    """
    First quartile of X
    """
    return quantile(X, 0.25)
def Q3(X):
    """
    Third quartile of X
    """
    return quantile(X, 0.75)
def midhinge(X):
    """
    average of first and third quartile of X. 
    """
    return 0.5 * (Q1(X) + Q2(X))
def trimean(X):
    return 0.25 * (Q1(X) + 2 * median(X) + Q3(X))
def IQR(X):
    """
    Interquartile range of X.
    """
    return Q3(X) - Q1(x)
def summary(X):
    """
    Tuple of quartiles of X.
    """
    return (min(X), Q1(X), median(X), Q3(x), max(X))
def skew(X, npasses = 1, xbar=None, sdev = None):
    """
    Skewness of X.
    """
    if xbar is None: xbar = mean(X, npasses)
    if sdev is None: sdev = sstd(X, npasses)
    S = sum([(x - xbar)**3 for x in X])/((len(X) -1.0)* sdev**3)
    return S
def kurt(X, npasses = 1, xbar = None, sdev = None):
    """
    Excess Kurtosis of X.
    """
    if xbar is None: xbar = mean(X, npasses)
    if sdev is None: sdev = sstd(X, npasses)
    K = sum([(x - xbar)**4 for x in X])/((len(X)-1.0) * sdev**4) - 3.0
    return K
if  __name__ == "__main__":
    import scipy.stats as stat
    X = stat.norm.rvs(loc = 5,  scale = 2,  size=10)
    print "mean = ",  mean(X)
    print "rms= ",  rms(X)
    print "hm=", hm(X)
    print "gm=",  gm(X)
    print "min=",  min(X)
    print "Q1 = ",  Q1(X)
    print "median=", median(X)
    print "Q3 = ",  Q3(X)
    print "max=",  max(X)
    print "skew=", skew(X)
    print "kurt =",  kurt(X)
When the above code was run, it printed the following:
python descstats.py
mean =  4.84676942423
rms=  5.03778343707
hm= 4.32905902716
gm= 4.61426927173
min= 2.0131960908
Q1 =  4.13817781222
median= 4.92637729064
Q3 =  5.47954029648
max= 7.31749490871
skew= -0.233219987989
kurt = -0.280050258646
The quantile.py may be found in the article Statistics: Computing quantiles with Python.TBD. lower and upper hinges.Some authors define lower and upper hinges as synonymous with Q1 and Q3 respectively but we will check with the more authoritative R software later.
R follows Tukey 1977. See boxplot.stats:
ReplyDeleteThe two ‘hinges’ are versions of the first and third quartile, i.e., close to quantile(x, c(1,3)/4). The hinges equal the quartiles for odd n (where n <- length(x)) and differ for even n. Whereas the quartiles only equal observations for n %% 4 == 1 (n = 1 mod 4), the hinges do so additionally for n %% 4 == 2 (n = 2 mod 4), and are in the middle of two observations otherwise.