In astronomy people often estimate a standard deviation in, say, background counts from the observed data themselves, since it is very difficult to know what true standard deviations are.
Then the question is this: How many measurements (e.g., the number of CCD pixels) do we need to estimate the true standard deviation?
I used the following code to find out empirically:
#!/usr/bin/env python2.6
import matplotlib.pyplot as plt
import numpy as np
def gauss(n, std):
return np.random.normal(0, std, n)
def uniform(n, std):
lim = std / 0.57735
return np.random.uniform(-lim, +lim, n)
def poisson(n, std):
return np.random.poisson(std * std, n)
def doeach(func, std0, nlogs, repeat=10000):
percenterrors = []
for nlog in nlogs:
n = int(10**nlog)
stds = []
for i in range(repeat):
values = func(n, std0)
stds.append(np.std(values, ddof=1))
std_in_measured = np.sqrt(((np.array(stds) - std0)**2).sum() / repeat)
percenterror = std_in_measured / std0
print('%6d %.3f %.3f' % (n, percenterror, 1./np.sqrt(n)))
percenterrors.append(percenterror)
return percenterrors
def main():
funcs = [(gauss, 1., 'Gaussian', 'b--'),
(poisson, 1., 'Poisson', 'r--'),
(uniform, 1., 'Uniform', 'g--')]
nlogs = np.linspace(0.5, 4.0, 20)
for f, std, label, symbol in funcs:
perrs = doeach(f, std, nlogs, repeat=10000)
plt.loglog(10**nlogs, perrs, symbol, lw=2, label=label)
plt.loglog(10**nlogs, 1./np.sqrt(10**nlogs), '-', lw=2, c='gray',
label='1/sqrt(n)')
plt.xlabel('Number of measurements')
plt.ylabel('Std. dev. in measured std. dev. / true std. dev.')
plt.xlim(2, 2e4)
plt.legend()
plt.grid(which='both')
plt.show()
if __name__ == '__main__':
main()
If you assume the counts are drawn from three distributions (Gaussian, Poisson, and uniform, of somewhat arbitrary means and widths) then the fractional uncertainty in the estimate of true standard deviation falls off like where
is the number of measurements that go into the estimate.
So the rule of thumb is that you need measurements to achieve a fractional uncertainty better than
.
