Sunday, August 15, 2010

Descriptive Statistics for Sample with Frequency Data

Assume that we have a sequence of X values with corresponding counts or frequencies(positive integer values only, not checked!), freq. Then the previous formulas for nonweighted data are suitably modified to work with frequency weighted data.

Here is an incomplete Python program (we will come back to make it more comprehensive).


"""
xfreqdescstats.py

Descriptive statistics on frequency data (x, freq).
We assume that freq consists of positive integers.

version   0.0.1   aug.15, 2010  basic functions, w/out skew and kurtosis.

author    Dr. Ernesto Adorio
              UPDEPP(UP Clarkfield, Philippines)

email      ernesto.adorio@gmail.com
"""
from math import *

def xfi(X,freq, i):
    """
    element at position i for (data, freq) inputs
    WARNING: The X and its associated freq must be sorted by
             the elements in X.
    """ 
    n = sum(freq)
    if i < 0 or i >= n:
       return None
    tot = 0 
    for x,f in zip(X,freq):
        tot += f
        if i < tot: return x
    return None

def xfquantile(x,freq,   q,  qtype = 7, issorted = False):
    """
    Args:
       x - input data
       freq - frequency data
       q - quantile
       qtype - algorithm
       issorted- True if x already sorted.
       
    Compute quantiles from input array x given q.For median,
    specify q=0.5.
    
    References:
       http://reference.wolfram.com/mathematica/ref/Quantile.html
       http://wiki.r-project.org/rwiki/doku.php?id=rdoc:stats:quantile
    
    Author:
       Ernesto P.Adorio Ph.D.
       UP Extension Program in Pampanga, Clark Field.
    """
    if not issorted:
        xf = sorted(zip(X,freq))
        y,  yfreq=zip(*xf) #unzip!!
    else:
        y, yfreq = x, freq
    if not (1 <= qtype <= 9): 
       return None  # error!

    # Parameters for the Hyndman and Fan algorithm
    abcd = [(0,   0, 1, 0), # inverse empirical distrib.function., R type 1
            (0.5, 0, 1, 0), # similar to type 1, averaged, R type 2
            (0.5, 0, 0, 0), # nearest order statistic,(SAS) R type 3

            (0,   0, 0, 1), # California linear interpolation, R type 4
            (0.5, 0, 0, 1), # hydrologists method, R type 5
            (0,   1, 0, 1), # mean-based estimate(Weibull method), (SPSS,Minitab), type 6 
            (1,  -1, 0, 1), # mode-based method,(S, S-Plus), R type 7
            (1.0/3, 1.0/3, 0, 1), # median-unbiased ,  R type 8
            (3/8.0, 0.25, 0, 1)   # normal-unbiased, R type 9.
           ]

    a, b, c, d = abcd[qtype-1]
    n = sum(yfreq)
    g, j = modf( a + (n+b) * q -1)
    if j < 0:
        return y[0]
    elif j > n:
        return y[n]

    j = int(floor(j))
    if g ==  0:
       return xfi(y, yfreq, j)
    else:
        return xfi(y, yfreq, j) + (xfi(y, yfreq, j+1)-xfi(y, yfreq, j))* (c + d * g)    

def xfmean(X, freq):
    """
    Arithmetic mean of X.
    """
    n = float(sum(freq))
    return sum([x*f for x,f in zip(X, freq)])/ n

def xfhm(X, freq):
    """
    Harmonic mean of X
    """
    n = float(sum(freq))
    return n / sum([f/x for x, freq in zip(X, freq)])

def xfgm(X, freq):
    """
    Geometric mean of X.
    """
    n = float(sum(freq))
    return exp( sum([f * log(x) for x,f in zip(X, freq)])/n)
    
def xfrms(X, freq):
    """
    Root mean square of x.
    """
    n = float(sum(freq))
    return sqrt(sum([x*x * f for x, f in zip(X, freq)])/ n)     
    
def xfpvar(X, freq):
    """
    Pop Variance of X. 
    """
    n = float(sum(freq))
    xbar = xfmean(X, freq)
    sum([f * (x - xbar)**2 for x, f in zip(X, freq)])/n

def xfsvar(X, freq):
    """
    Sample variance of X.
    """
    n = float(sum(freq))
    return xfpvar(X, freq) * n/(n-1.0) 

def xfpstd(X, freq):
    """
    Pop. std. deviation of X
    """
    return sqrt(xfpvar(X, freq))

def xfsstd(X, freq):
    """
    Sample std. deviation of X
    """
    return sqrt(xfsvar(X, freq))

def xfmedian(X, freq):
    return xfquantile(X, freq, 0.50)
    
def xfQ1(X, freq):
 return xfquantile(X, freq, 0.25)
 
def xfQ3(X, freq):
 return xfquantile(X, freq, 0.75)

def xfdisttable(X,  freq):
    """
    Returns a table of tuples consisting of columns for X, freq, rf and crf.
    """
    n = float(sum(freq))
    rf  = [f/n for f in freq]
    crf = rf[:]
    for i in range(1,  len(rf)):    # Scipy has cumsum!
        crf[i] += crf[i-1]    
    return zip(X,  freq,  rf,  crf)

if __name__ == "__main__":
   X =    [1, 2, 5, 7]
   freq = [5,10, 9, 11]
   #for i in range(sum(freq)):
   #   print i, xfi(X, freq, i)
   print xfQ1(X,  freq),  xfmedian(X, freq) ,xfQ3(X,  freq) 
   for row in xfdisttable(X,  freq):
       print row


We will add more functions to the above Python code and we shall always be doing tests to see if it is correct. Add comments for your wishlist and suggested modifications to make the code robust.

No comments:

Post a Comment