# Source code for pyretis.analysis.analysis

```python
# -*- coding: utf-8 -*-
# Copyright (c) 2023, PyRETIS Development Team.
"""Module defining functions useful in the analysis of simulation data.

Important methods defined here
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

running_average (:py:func:`.running_average`)
Method to calculate a running average.

block_error (:py:func:`.block_error`)
Perform block error analysis.

block_error_corr (:py:func:`.block_error_corr`)
Method to run a block error analysis and calculate relative
errors and correlation length.
"""
import numpy as np
from pyretis.analysis.histogram import histogram_and_avg

# Names exported by ``from pyretis.analysis.analysis import *``.
__all__ = ['running_average', 'block_error', 'block_error_corr']
# Tell NumPy to stay silent on division by zero and on invalid operations
# (such as 0/0) instead of warning. NOTE: this alters NumPy's error-handling
# state as a side effect of importing this module; it is not scoped to the
# functions defined here.
np.seterr(divide='ignore', invalid='ignore')

def running_average(data):
    """Create a running average of the given data.

    The running average is accumulated over the rows (first axis).

    Parameters
    ----------
    data : numpy.array
        The data to average. A 2D array is read as weighted data:
        column 0 holds the values and column 1 holds the weights.
        Any other array is averaged with equal weights.

    Returns
    -------
    out : numpy.array
        The running average. For 2D input this is the running
        weighted average ``cumsum(value * weight) / cumsum(weight)``.

    """
    if data.ndim != 2:
        # Unweighted case: divide the running sum by 1, 2, 3, ...
        counts = np.arange(1, data.shape[0] + 1, dtype=float)
        return data.cumsum() / counts
    values = data[:, 0]
    weights = data[:, 1]
    weighted_sum = (values * weights).cumsum()
    return weighted_sum / weights.cumsum()

def block_error(data, maxblock=None, blockskip=1, weights=None):
    """Perform block error analysis.

    This function will estimate the standard deviation in the input
    data by performing a block analysis. The number of blocks
    to consider can be specified or it will be taken as the
    half of the length of the input data. Averages and variance are
    calculated using an on-the-fly algorithm [1]_.

    Parameters
    ----------
    data : numpy.array (or iterable with data points)
        The data to analyse. It must support ``len()``. When `weights`
        is not given, the data is converted with ``numpy.asarray``:
        1D data is analysed with uniform weights, while data with more
        dimensions is read row by row as ``(value, weight)`` taken from
        the first two columns.
    maxblock : int, optional
        Can be used to set the maximum length of the blocks to
        consider. Note that the `maxblock` will never be set longer
        than half the length in data. ``None`` or a value below 1
        selects half the length of the data.
    blockskip : int, optional
        This can be used to skip certain block lengths, i.e.
        `blockskip = 1` will consider all blocks up to `maxblock`, while
        `blockskip = n` will consider every n'th block up to `maxblock`,
        i.e. it will use block lengths equal to `1`, `1 + n`, `1 + 2*n`,
        and so on.
    weights : iterable of floats, optional
        One weight per data point. If given, `data` is iterated as-is
        and paired element by element with these weights.

    Returns
    -------
    blocklen : numpy.array
        These contain the block lengths considered.
    block_avg : numpy.array
        The averages as a function of the block length.
    block_err : numpy.array
        Estimate of errors as a function of the block length.
    block_err_avg : float
        Average of the error estimate using blocks where
        ``length > maxblock//2``.

    References
    ----------
    .. [1] Wikipedia, "Algorithms for calculating variance",
       http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance

    """
    if maxblock is None or maxblock < 1:
        maxblock = len(data) // 2
    else:
        maxblock = min(maxblock, len(data) // 2)
    if weights is None:
        # Convert first so that plain lists/tuples work as documented
        # (they have no ``.shape`` attribute).
        indata = np.asarray(data)
        if indata.ndim == 1:
            data = indata
            # Uniform weights: the magnitude of a constant weight cancels
            # in the normalisation at the end, so the value 2 does not
            # influence the result.
            weights = np.ones(len(indata)) * 2
        else:
            data = indata[:, 0]
            weights = indata[:, 1]
    # blocklen contains the lengths of the blocks. ``arange`` creates
    # [0, ..., maxblock), so add 1 to make blocklen[0] = 1 and so on.
    blocklen = np.arange(0, maxblock, blockskip, dtype=np.int_)
    blocklen += 1
    block = np.zeros(len(blocklen))  # to accumulate values for a block
    blockw = np.zeros(len(blocklen))  # accumulate weights for a block
    tot_w = np.zeros(block.shape)  # accumulate total weights
    tot_w_s = np.zeros(block.shape)  # accumulate total weights**2
    block_avg = np.zeros(block.shape)  # to store averages in block
    block_var = np.zeros(block.shape)  # estimator of variance

    # Algorithm taken from
    # https://markusthill.github.io/math/stats/ml/
    # online-estimation-of-weighted-sample-mean-and-coviarance-matrix/
    # equation 39 where we delay the normalization to the end.
    # Divisions by zero (e.g. all-zero weights in a block) are expected
    # and mapped to 0 via ``nan_to_num``; silence them locally so this
    # does not depend on the global NumPy error state.
    with np.errstate(divide='ignore', invalid='ignore'):
        for i, (datai, weighti) in enumerate(zip(data, weights)):
            block += weighti*datai  # accumulate the value to all blocks
            blockw += weighti
            # next pick out blocks which are "full":
            k = np.where((i + 1) % blocklen == 0)[0]
            # turn the weighted sum into the weighted block average
            block[k] = np.nan_to_num(block[k] / blockw[k])  # 0/0 = NaN
            tot_w[k] += blockw[k]
            tot_w_s[k] += blockw[k]**2
            # update estimate of average and variance; the order matters:
            # delta1 uses the old average, the variance update the new one
            delta1 = blockw[k] * (block[k] - block_avg[k])
            block_avg[k] = block_avg[k] + np.nan_to_num(delta1 / tot_w[k])
            block_var[k] = block_var[k] + delta1 * (block[k] - block_avg[k])
            # reset these blocks
            block[k] = 0.0
            blockw[k] = 0

        # effective number of blocks for each block length
        nsamp = tot_w * tot_w / tot_w_s
        block_var *= nsamp / (nsamp - 1) / tot_w
        block_err = np.sqrt(block_var / nsamp)  # estimate of error
        block_err_avg = np.average(block_err[blocklen > maxblock // 2])
    return blocklen, block_avg, block_err, block_err_avg

def block_error_corr(data, maxblock=None, blockskip=1):
    """Run block error analysis on the given data.

    The block error analysis is delegated to :py:func:`.block_error`;
    on top of its output, relative errors and correlation lengths
    are derived.

    Parameters
    ----------
    data : numpy.array
        Data to analyse.
    maxblock : int, optional
        The maximum block length to consider.
    blockskip : int, optional
        This can be used to skip certain block lengths, i.e.
        `blockskip = 1` will consider all blocks up to `maxblock`, while
        `blockskip = n` will consider every n'th block up to `maxblock`,
        i.e. it will use block lengths equal to `1`, `1 + n`, `1 + 2*n`,
        and so on.

    Returns
    -------
    out[0] : numpy.array
        The block lengths considered (`blen`).
    out[1] : numpy.array
        Estimate of errors as a function of the block length (`berr`).
    out[2] : float
        Average of the error estimate for blocks (`berr_avg`)
        with ``length > maxblock // 2``.
    out[3] : numpy.array
        Estimate of relative errors, i.e. `berr` divided by the
        absolute value of the first block average, as a function of
        block length (`rel_err`).
    out[4] : float
        The average relative error (`avg_rel_err`), for blocks
        with ``length > maxblock // 2``.
    out[5] : numpy.array
        The estimated correlation length as a function of the block
        length (`ncor`), i.e. the squared error relative to the
        squared error of the first block length.
    out[6] : float
        The average (for blocks with length > maxblock // 2) estimated
        correlation length (`avg_ncor`).

    """
    blen, bavg, berr, berr_avg = block_error(
        data, maxblock=maxblock, blockskip=blockskip
    )
    # Both normalisations refer to the first block length considered.
    reference_avg = abs(bavg[0])
    reference_var = berr[0]**2
    rel_err = np.divide(berr, reference_avg)
    avg_rel_err = np.divide(berr_avg, reference_avg)
    ncor = np.divide(berr**2, reference_var)
    avg_ncor = np.divide(berr_avg**2, reference_var)
    return blen, berr, berr_avg, rel_err, avg_rel_err, ncor, avg_ncor

def mean_square_displacement(data, ndt=None):
    """Calculate the mean square displacement for the given data.

    For every time lag ``i`` in ``1, ..., ndt - 1`` the squared
    displacements ``(data[i:] - data[:-i])**2`` are collected and
    their mean and standard deviation are stored.

    Parameters
    ----------
    data : numpy.array, 1D
        This numpy.array contains the data as a function of time.
    ndt : int, optional
        Exclusive upper bound for the time lag, i.e. lags from 1 up to
        ``ndt - 1`` are evaluated. If not specified (or smaller than 1)
        the length of the input data divided by 5 (``// 5``) is used.

    Returns
    -------
    msd : numpy.array, 2D
        One row per time lag (row ``k`` belongs to lag ``k + 1``).
        The first column is the mean squared displacement and the
        second column is the corresponding standard deviation
        (as given by ``numpy.ndarray.std`` with default settings).
        If no lag is evaluated, an empty array with shape ``(0, 2)``
        is returned.

    """
    length = data.shape[0]
    if ndt is None or ndt < 1:
        ndt = length // 5
    msd = []
    for lag in range(1, ndt):
        delta = (data[lag:] - data[:-lag])**2
        msd.append((delta.mean(), delta.std()))
    if not msd:
        # Keep the documented two-column layout even without any rows so
        # that callers can always index ``msd[:, 0]`` / ``msd[:, 1]``.
        return np.empty((0, 2))
    return np.array(msd)

def analyse_data(data, settings):
    """Analyse the given data and run some common analysis procedures.

    Three analyses are carried out, in this order:

    1) A running average.

    2) A histogram (normalised as a density).

    3) A block error analysis.

    Parameters
    ----------
    data : numpy.array, 1D
        This numpy.array contains the data as a function of time.
    settings : dict
        This dictionary contains settings for the analysis. The keys
        ``'bins'``, ``'maxblock'`` and ``'blockskip'`` are read from
        ``settings['analysis']``.

    Returns
    -------
    result : dict
        The results, stored under the keys ``'running'``,
        ``'distribution'`` and ``'blockerror'``.

    """
    asett = settings['analysis']
    return {
        # 1) The running average
        'running': running_average(data),
        # 2) The distribution
        'distribution': histogram_and_avg(
            data, asett['bins'], density=True
        ),
        # 3) The block error analysis
        'blockerror': block_error_corr(
            data,
            maxblock=asett['maxblock'],
            blockskip=asett['blockskip'],
        ),
    }
```