# -*- coding: utf-8 -*-
'''
Copyright 2015 The pysimoa Developers
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
# Python 2/3 compatibility
from __future__ import division
import numpy as np
import scipy.stats
[docs]def online_variance(row_a, row_b):
r"""
Combine the sample mean and variance of two samples
This is a function ready for use for `reduce`.
It is commutative and associative.
Parameters
----------
row_a, row_b
Tuple-like variables with first item being the number of data points
in that sample, the second item being the mean of that sample,
and the third item being the sum of squared differences of the data
points to the mean in that sample.
Returns
-------
ret : tuple
``(n_ab, mean_ab, m2_ab)``
n_ab
number of data points in the combined sample
mean_ab
mean of the combined sample
m2_ab
sum of squared difference to the mean.
To get the variance, divide by ``n_ab - 1``.
Notes
-----
.. math::
n_{ab} = n_a + n_b
\bar{x}_ab = \frac{n_a \bar{x}_a + n_b \bar{x}_b}{n_{ab}}
S_{ab} = S_a + S_b + \frac{n_a n_b}{n_{ab}} (\bar{x}_b - \bar{x}_a)^2
References
----------
https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms for
computing the sample variance: Analysis and recommendations." The American
Statistician 37.3 (1983): 242-247.
http://www.jstor.org/stable/2683386
Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Updating formulae
and a pairwise algorithm for computing sample variances." COMPSTAT 1982 5th
Symposium held at Toulouse 1982. Physica-Verlag HD, 1982.
http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
"""
n_a, mean_a, m2_a = row_a
n_b, mean_b, m2_b = row_b
n_ab = n_a + n_b
mean_ab = (mean_a * n_a + mean_b * n_b) / (n_a + n_b)
m2_ab = m2_a + m2_b + np.square(mean_b - mean_a) * n_a * n_b / n_ab
return n_ab, mean_ab, m2_ab
[docs]def von_neumann_ratio_test(data, alpha, verbose=False):
"""
Test a series of observations for independence (lack of autocorrelation)
The von-Neumann ratio test is a statistical test designed for testing the
independence of subsequent observations.
The null hypothesis is that the data are independent and normally
distributed.
Parameters
----------
alpha: float
Significance level.
This is the probability threshold below which the null hypothesis will
be rejected.
Notes
-----
Given a series :math:`x_i` of :math:`n` data points, the von-Neumann test
statistic is [1]_ [6]_
.. math::
v = \frac{\sum_{i=1}^{n-1} (x_{i+1} - x_i)^2}{\sum_{i=1}^n (x_i -
\bar{x})^2
Under the null hypothesis, the mean :math:`\bar{v} = 2` and the variance
:math:`\sigma^2_v = \frac{4 (n - 2)}{(n-1)(n+1)}` [3]_.
References
----------
.. [1] Von Neumann, J. (1941). Distribution of the ratio of the mean square
successive difference to the variance. The Annals of Mathematical
Statistics, 12(4), 367-395.
.. [2] The Mean Square Successive Difference
J. von Neumann, R. H. Kent, H. R. Bellinson and B. I. Hart
The Annals of Mathematical Statistics
Vol. 12, No. 2 (Jun., 1941) , pp. 153-162
http://www.jstor.org/stable/2235765
.. [3] Moments of the Ratio of the Mean Square Successive Difference to the
Mean Square Difference in Samples From a Normal Universe J. D. Williams
The Annals of Mathematical Statistics
Vol. 12, No. 2 (Jun., 1941) , pp. 239-241
http://www.jstor.org/stable/2235775
.. [4] Madansky, Albert,
Testing for Independence of Observations,
In: Prescriptions for Working Statisticians
Springer New York
10.1007/978-1-4612-3794-5_4
http://dx.doi.org/10.1007/978-1-4612-3794-5_4
"""
mean_square_successive_difference = np.power(np.ediff1d(data), 2).mean()
von_neumann_ratio = mean_square_successive_difference / data.var()
von_neumann_mean = 2.
von_neumann_var = 4. * (data.size - 2.) / (data.size ** 2 - 1.)
acceptance_region = scipy.stats.norm.interval(
1.0 - alpha,
loc=von_neumann_mean,
scale=np.sqrt(von_neumann_var)
)
if verbose:
print(
(
"von Neumann ratio v = {:.2f},\n"
"acceptance region at {:.0f}% significance level is "
"({:.2f}, {:.2f})"
).format(
von_neumann_ratio,
alpha * 100,
acceptance_region[0],
acceptance_region[1]
)
)
return acceptance_region[0] < von_neumann_ratio < acceptance_region[1]