The code here uses the BFGS algorithm but performs now a production run and writes to file all average values of the energy.
# 2-electron VMC code for 2dim quantum dot with importance sampling
# Using gaussian rng for new positions and Metropolis- Hastings
# Added energy minimization
from math import exp, sqrt
from random import random, seed, normalvariate
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from scipy.optimize import minimize
import sys
import os
# Where to save data files
PROJECT_ROOT_DIR = "Results"
DATA_ID = "Results/EnergyMin"
if not os.path.exists(PROJECT_ROOT_DIR):
os.mkdir(PROJECT_ROOT_DIR)
if not os.path.exists(DATA_ID):
os.makedirs(DATA_ID)
def data_path(dat_id):
return os.path.join(DATA_ID, dat_id)
outfile = open(data_path("Energies.dat"),'w')
# Trial wave function for the 2-electron quantum dot in two dims
def WaveFunction(r,alpha,beta):
r1 = r[0,0]**2 + r[0,1]**2
r2 = r[1,0]**2 + r[1,1]**2
r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
deno = r12/(1+beta*r12)
return exp(-0.5*alpha*(r1+r2)+deno)
# Local energy for the 2-electron quantum dot in two dims, using analytical local energy
def LocalEnergy(r,alpha,beta):
r1 = (r[0,0]**2 + r[0,1]**2)
r2 = (r[1,0]**2 + r[1,1]**2)
r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
deno = 1.0/(1+beta*r12)
deno2 = deno*deno
return 0.5*(1-alpha*alpha)*(r1 + r2) +2.0*alpha + 1.0/r12+deno2*(alpha*r12-deno2+2*beta*deno-1.0/r12)
# Derivate of wave function ansatz as function of variational parameters
def DerivativeWFansatz(r,alpha,beta):
WfDer = np.zeros((2), np.double)
r1 = (r[0,0]**2 + r[0,1]**2)
r2 = (r[1,0]**2 + r[1,1]**2)
r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
deno = 1.0/(1+beta*r12)
deno2 = deno*deno
WfDer[0] = -0.5*(r1+r2)
WfDer[1] = -r12*r12*deno2
return WfDer
# Setting up the quantum force for the two-electron quantum dot, recall that it is a vector
def QuantumForce(r,alpha,beta):
qforce = np.zeros((NumberParticles,Dimension), np.double)
r12 = sqrt((r[0,0]-r[1,0])**2 + (r[0,1]-r[1,1])**2)
deno = 1.0/(1+beta*r12)
qforce[0,:] = -2*r[0,:]*alpha*(r[0,:]-r[1,:])*deno*deno/r12
qforce[1,:] = -2*r[1,:]*alpha*(r[1,:]-r[0,:])*deno*deno/r12
return qforce
# Computing the derivative of the energy and the energy
def EnergyDerivative(x0):
# Parameters in the Fokker-Planck simulation of the quantum force
D = 0.5
TimeStep = 0.05
# positions
PositionOld = np.zeros((NumberParticles,Dimension), np.double)
PositionNew = np.zeros((NumberParticles,Dimension), np.double)
# Quantum force
QuantumForceOld = np.zeros((NumberParticles,Dimension), np.double)
QuantumForceNew = np.zeros((NumberParticles,Dimension), np.double)
energy = 0.0
DeltaE = 0.0
alpha = x0[0]
beta = x0[1]
EnergyDer = 0.0
DeltaPsi = 0.0
DerivativePsiE = 0.0
#Initial position
for i in range(NumberParticles):
for j in range(Dimension):
PositionOld[i,j] = normalvariate(0.0,1.0)*sqrt(TimeStep)
wfold = WaveFunction(PositionOld,alpha,beta)
QuantumForceOld = QuantumForce(PositionOld,alpha, beta)
#Loop over MC MCcycles
for MCcycle in range(NumberMCcycles):
#Trial position moving one particle at the time
for i in range(NumberParticles):
for j in range(Dimension):
PositionNew[i,j] = PositionOld[i,j]+normalvariate(0.0,1.0)*sqrt(TimeStep)+\
QuantumForceOld[i,j]*TimeStep*D
wfnew = WaveFunction(PositionNew,alpha,beta)
QuantumForceNew = QuantumForce(PositionNew,alpha, beta)
GreensFunction = 0.0
for j in range(Dimension):
GreensFunction += 0.5*(QuantumForceOld[i,j]+QuantumForceNew[i,j])*\
(D*TimeStep*0.5*(QuantumForceOld[i,j]-QuantumForceNew[i,j])-\
PositionNew[i,j]+PositionOld[i,j])
GreensFunction = exp(GreensFunction)
ProbabilityRatio = GreensFunction*wfnew**2/wfold**2
#Metropolis-Hastings test to see whether we accept the move
if random() <= ProbabilityRatio:
for j in range(Dimension):
PositionOld[i,j] = PositionNew[i,j]
QuantumForceOld[i,j] = QuantumForceNew[i,j]
wfold = wfnew
DeltaE = LocalEnergy(PositionOld,alpha,beta)
DerPsi = DerivativeWFansatz(PositionOld,alpha,beta)
DeltaPsi += DerPsi
energy += DeltaE
DerivativePsiE += DerPsi*DeltaE
# We calculate mean values
energy /= NumberMCcycles
DerivativePsiE /= NumberMCcycles
DeltaPsi /= NumberMCcycles
EnergyDer = 2*(DerivativePsiE-DeltaPsi*energy)
return EnergyDer
# Computing the expectation value of the local energy
def Energy(x0):
# Parameters in the Fokker-Planck simulation of the quantum force
D = 0.5
TimeStep = 0.05
# positions
PositionOld = np.zeros((NumberParticles,Dimension), np.double)
PositionNew = np.zeros((NumberParticles,Dimension), np.double)
# Quantum force
QuantumForceOld = np.zeros((NumberParticles,Dimension), np.double)
QuantumForceNew = np.zeros((NumberParticles,Dimension), np.double)
energy = 0.0
DeltaE = 0.0
alpha = x0[0]
beta = x0[1]
#Initial position
for i in range(NumberParticles):
for j in range(Dimension):
PositionOld[i,j] = normalvariate(0.0,1.0)*sqrt(TimeStep)
wfold = WaveFunction(PositionOld,alpha,beta)
QuantumForceOld = QuantumForce(PositionOld,alpha, beta)
#Loop over MC MCcycles
for MCcycle in range(NumberMCcycles):
#Trial position moving one particle at the time
for i in range(NumberParticles):
for j in range(Dimension):
PositionNew[i,j] = PositionOld[i,j]+normalvariate(0.0,1.0)*sqrt(TimeStep)+\
QuantumForceOld[i,j]*TimeStep*D
wfnew = WaveFunction(PositionNew,alpha,beta)
QuantumForceNew = QuantumForce(PositionNew,alpha, beta)
GreensFunction = 0.0
for j in range(Dimension):
GreensFunction += 0.5*(QuantumForceOld[i,j]+QuantumForceNew[i,j])*\
(D*TimeStep*0.5*(QuantumForceOld[i,j]-QuantumForceNew[i,j])-\
PositionNew[i,j]+PositionOld[i,j])
GreensFunction = exp(GreensFunction)
ProbabilityRatio = GreensFunction*wfnew**2/wfold**2
#Metropolis-Hastings test to see whether we accept the move
if random() <= ProbabilityRatio:
for j in range(Dimension):
PositionOld[i,j] = PositionNew[i,j]
QuantumForceOld[i,j] = QuantumForceNew[i,j]
wfold = wfnew
DeltaE = LocalEnergy(PositionOld,alpha,beta)
energy += DeltaE
if Printout:
outfile.write('%f\n' %(energy/(MCcycle+1.0)))
# We calculate mean values
energy /= NumberMCcycles
return energy
#Here starts the main program with variable declarations
NumberParticles = 2
Dimension = 2
# seed for rng generator
seed()
# Monte Carlo cycles for parameter optimization
Printout = False
NumberMCcycles= 10000
# guess for variational parameters
x0 = np.array([0.9,0.2])
# Using Broydens method to find optimal parameters
res = minimize(Energy, x0, method='BFGS', jac=EnergyDerivative, options={'gtol': 1e-4,'disp': True})
x0 = res.x
# Compute the energy again with the optimal parameters and increased number of Monte Cycles
NumberMCcycles= 2**19
Printout = True
FinalEnergy = Energy(x0)
EResult = np.array([FinalEnergy,FinalEnergy])
outfile.close()
#nice printout with Pandas
import pandas as pd
from pandas import DataFrame
data ={'Optimal Parameters':x0, 'Final Energy':EResult}
frame = pd.DataFrame(data)
print(frame)
Note that the minimize function returns the final values for the variable α=x0[0] and β=x0[1] in the array x.
When we have found the minimum, we use these optimal parameters to perform a production run of energies. The output is in turn written to file and is used, together with resampling methods like the blocking method, to obtain the best possible estimate for the standard deviation. The optimal minimum is, even with our guess, rather close to the exact value of 3.0 a.u.
Our next step is to use the output values of the energy and perform a blocking analysis of the results in order to get a best possible estimate of the standard deviation.
The next step is then to use the above data sets and perform a resampling analysis, either using say the Bootstrap method or the Blocking method.
# Common imports
import os
# Where to save the figures and data files
DATA_ID = "Results/EnergyMin"
def data_path(dat_id):
return os.path.join(DATA_ID, dat_id)
infile = open(data_path("Energies.dat"),'r')
from numpy import std, mean, concatenate, arange, loadtxt, zeros, ceil
from numpy.random import randint
from time import time
def tsboot(data,statistic,R,l):
t = zeros(R); n = len(data); k = int(ceil(float(n)/l));
inds = arange(n); t0 = time()
# time series bootstrap
for i in range(R):
# construct bootstrap sample from
# k chunks of data. The chunksize is l
_data = concatenate([data[j:j+l] for j in randint(0,n-l,k)])[0:n];
t[i] = statistic(_data)
# analysis
print ("Runtime: %g sec" % (time()-t0)); print ("Bootstrap Statistics :")
print ("%8g %14g %15g" % (statistic(data), \
mean(t) - statistic(data), \
std(t) ))
return t
# Read in data
X = loadtxt(infile)
# statistic to be estimated. Takes two args.
# arg1: the data
def stat(data):
return mean(data)
t = tsboot(X, stat, 2**12, 2**10)
The blocking code, based on the article of Marius Jonsson is given here
# Common imports
import os
# Where to save the figures and data files
DATA_ID = "Results/EnergyMin"
def data_path(dat_id):
return os.path.join(DATA_ID, dat_id)
infile = open(data_path("Energies.dat"),'r')
from numpy import log2, zeros, mean, var, sum, loadtxt, arange, array, cumsum, dot, transpose, diagonal, sqrt
from numpy.linalg import inv
def block(x):
# preliminaries
n = len(x)
d = int(log2(n))
s, gamma = zeros(d), zeros(d)
mu = mean(x)
# estimate the auto-covariance and variances
# for each blocking transformation
for i in arange(0,d):
n = len(x)
# estimate autocovariance of x
gamma[i] = (n)**(-1)*sum( (x[0:(n-1)]-mu)*(x[1:n]-mu) )
# estimate variance of x
s[i] = var(x)
# perform blocking transformation
x = 0.5*(x[0::2] + x[1::2])
# generate the test observator M_k from the theorem
M = (cumsum( ((gamma/s)**2*2**arange(1,d+1)[::-1])[::-1] ) )[::-1]
# we need a list of magic numbers
q =array([6.634897,9.210340, 11.344867, 13.276704, 15.086272, 16.811894, 18.475307, 20.090235, 21.665994, 23.209251, 24.724970, 26.216967, 27.688250, 29.141238, 30.577914, 31.999927, 33.408664, 34.805306, 36.190869, 37.566235, 38.932173, 40.289360, 41.638398, 42.979820, 44.314105, 45.641683, 46.962942, 48.278236, 49.587884, 50.892181])
# use magic to determine when we should have stopped blocking
for k in arange(0,d):
if(M[k] < q[k]):
break
if (k >= d-1):
print("Warning: Use more data")
return mu, s[k]/2**(d-k)
x = loadtxt(infile)
(mean, var) = block(x)
std = sqrt(var)
import pandas as pd
from pandas import DataFrame
data ={'Mean':[mean], 'STDev':[std]}
frame = pd.DataFrame(data,index=['Values'])
print(frame)
Why aren't the final averages from the MC sampling the same as those from the Blocking and Bootstrap analysis?
Let us now go through some of the basic theory behind the Bootstrap and the Blocking methods. This means that we need to deal with so-called resampling methods.
Resampling methods are an indispensable tool in modern statistics. They involve repeatedly drawing samples from a training set and refitting a model of interest on each sample in order to obtain additional information about the fitted model. For example, in order to estimate the variability of a linear regression fit, we can repeatedly draw different samples from the training data, fit a linear regression to each new sample, and then examine the extent to which the resulting fits differ. Such an approach may allow us to obtain information that would not be available from fitting the model only once using the original training sample.
Resampling approaches can be computationally expensive, because they involve fitting the same statistical method multiple times using different subsets of the training data. However, due to recent advances in computing power, the computational requirements of resampling methods generally are not prohibitive. In this chapter, we discuss two of the most commonly used resampling methods, cross-validation and the bootstrap. Both methods are important tools in the practical application of many statistical learning procedures. For example, cross-validation can be used to estimate the test error associated with a given statistical learning method in order to evaluate its performance, or to select the appropriate level of flexibility. The process of evaluating a model’s performance is known as model assessment, whereas the process of selecting the proper level of flexibility for a model is known as model selection. The bootstrap is widely used.
Statistical errors can be estimated using standard tools from statistics.
Systematical errors are method specific and must be treated differently from case to case.
The probability distribution function (PDF) is a function p(x) on the domain which, in the discrete case, gives us the probability or relative frequency with which these values of X occur:
p(x)=prob(X=x)
In the continuous case, the PDF does not directly depict the actual probability. Instead we define the probability for the stochastic variable to assume any value on an infinitesimal interval around x to be p(x)dx. The continuous function p(x) then gives us the density of the probability rather than the probability itself. The probability for a stochastic variable to assume any value on a non-infinitesimal interval [a,b] is then just the integral:
prob(a≤X≤b)=∫bap(x)dx
Qualitatively speaking, a stochastic variable represents the values of numbers chosen as if by chance from some specified PDF so that the selection of a large set of these numbers reproduces this PDF.
A particularly useful class of special expectation values are the moments. The n-th moment of the PDF p is defined as follows:
⟨xn⟩≡∫xnp(x)dx
The zero-th moment ⟨1⟩ is just the normalization condition of p. The first moment, ⟨x⟩, is called the mean of p and often denoted by the letter μ:
⟨x⟩=μ≡∫xp(x)dx
A special version of the moments is the set of central moments, the n-th central moment defined as:
⟨(x−⟨x⟩)n⟩≡∫(x−⟨x⟩)np(x)dx
The zero-th and first central moments are both trivial, equal 1 and 0, respectively. But the second central moment, known as the variance of p, is of particular interest. For the stochastic variable X, the variance is denoted as σ2X or var(X):
σ2X = var(X)=⟨(x−⟨x⟩)2⟩=∫(x−⟨x⟩)2p(x)dx=∫(x2−2x⟨x⟩2+⟨x⟩2)p(x)dx=⟨x2⟩−2⟨x⟩⟨x⟩+⟨x⟩2=⟨x2⟩−⟨x⟩2
The square root of the variance, σ=√⟨(x−⟨x⟩)2⟩ is called the standard deviation of p. It is clearly just the RMS (root-mean-square) value of the deviation of the PDF from its mean value, interpreted qualitatively as the spread of p around its mean.
Another important quantity is the so called covariance, a variant of the above defined variance. Consider again the set {Xi} of n stochastic variables (not necessarily uncorrelated) with the multivariate PDF P(x1,…,xn). The covariance of two of the stochastic variables, Xi and Xj, is defined as follows:
cov(Xi,Xj)≡⟨(xi−⟨xi⟩)(xj−⟨xj⟩)⟩=∫⋯∫(xi−⟨xi⟩)(xj−⟨xj⟩)P(x1,…,xn)dx1…dxn
with
⟨xi⟩=∫⋯∫xiP(x1,…,xn)dx1…dxn
If we consider the above covariance as a matrix Cij=cov(Xi,Xj), then the diagonal elements are just the familiar variances, Cii=cov(Xi,Xi)=var(Xi). It turns out that all the off-diagonal elements are zero if the stochastic variables are uncorrelated. This is easy to show, keeping in mind the linearity of the expectation value. Consider the stochastic variables Xi and Xj, (i≠j):
cov(Xi,Xj)=⟨(xi−⟨xi⟩)(xj−⟨xj⟩)⟩=⟨xixj−xi⟨xj⟩−⟨xi⟩xj+⟨xi⟩⟨xj⟩⟩=⟨xixj⟩−⟨xi⟨xj⟩⟩−⟨⟨xi⟩xj⟩+⟨⟨xi⟩⟨xj⟩⟩=⟨xixj⟩−⟨xi⟩⟨xj⟩−⟨xi⟩⟨xj⟩+⟨xi⟩⟨xj⟩=⟨xixj⟩−⟨xi⟩⟨xj⟩
If Xi and Xj are independent, we get ⟨xixj⟩=⟨xi⟩⟨xj⟩, resulting in cov(Xi,Xj)=0 (i≠j).
Also useful for us is the covariance of linear combinations of stochastic variables. Let {Xi} and {Yi} be two sets of stochastic variables. Let also {ai} and {bi} be two sets of scalars. Consider the linear combination:
U=∑iaiXiV=∑jbjYj
By the linearity of the expectation value
cov(U,V)=∑i,jaibjcov(Xi,Yj)
Now, since the variance is just var(Xi)=cov(Xi,Xi), we get the variance of the linear combination U=∑iaiXi:
var(U)=∑i,jaiajcov(Xi,Xj)
And in the special case when the stochastic variables are uncorrelated, the off-diagonal elements of the covariance are as we know zero, resulting in:
var(U)=∑ia2icov(Xi,Xi)=∑ia2ivar(Xi)
var(∑iaiXi)=∑ia2ivar(Xi)
which will become very useful in our study of the error in the mean value of a set of measurements.
A stochastic process is a process that produces sequentially a chain of values:
{x1,x2,…xk,…}.
We will call these values our measurements and the entire set as our measured sample. The action of measuring all the elements of a sample we will call a stochastic experiment since, operationally, they are often associated with results of empirical observation of some physical or mathematical phenomena; precisely an experiment. We assume that these values are distributed according to some PDF pXX(x), where X is just the formal symbol for the stochastic variable whose PDF is pXX(x). Instead of trying to determine the full distribution p we are often only interested in finding the few lowest moments, like the mean μXX and the variance σXX.
In practical situations a sample is always of finite size. Let that size be n. The expectation value of a sample, the sample mean, is then defined as follows:
ˉxn≡1nn∑k=1xk
The sample variance is:
var(x)≡1nn∑k=1(xk−ˉxn)2
its square root being the standard deviation of the sample. The sample covariance is:
cov(x)≡1n∑kl(xk−ˉxn)(xl−ˉxn)
Note that the sample variance is the sample covariance without the cross terms. In a similar manner as the covariance in Eq. (5) is a measure of the correlation between two stochastic variables, the above defined sample covariance is a measure of the sequential correlation between succeeding measurements of a sample.
These quantities, being known experimental values, differ significantly from and must not be confused with the similarly named quantities for stochastic variables, mean μX, variance var(X) and covariance cov(X,Y).
The law of large numbers states that as the size of our sample grows to infinity, the sample mean approaches the true mean μXX of the chosen PDF:
limn→∞ˉxn=μXX
The sample mean ˉxn works therefore as an estimate of the true mean μXX.
What we need to find out is how good an approximation ˉxn is to μXX. In any stochastic measurement, an estimated mean is of no use to us without a measure of its error. A quantity that tells us how well we can reproduce it in another experiment. We are therefore interested in the PDF of the sample mean itself. Its standard deviation will be a measure of the spread of sample means, and we will simply call it the error of the sample mean, or just sample error, and denote it by errXX. In practice, we will only be able to produce an estimate of the sample error since the exact value would require the knowledge of the true PDFs behind, which we usually do not have.
Let us first take a look at what happens to the sample error as the size of the sample grows. In a sample, each of the measurements xi can be associated with its own stochastic variable Xi. The stochastic variable ¯Xn for the sample mean ˉxn is then just a linear combination, already familiar to us:
¯Xn=1nn∑i=1Xi
All the coefficients are just equal 1/n. The PDF of ¯Xn, denoted by p¯Xn(x) is the desired PDF of the sample means.
The probability density of obtaining a sample mean ˉxn is the product of probabilities of obtaining arbitrary values x1,x2,…,xn with the constraint that the mean of the set {xi} is ˉxn:
p¯Xn(x)=∫pXX(x1)⋯∫pXX(xn) δ(x−x1+x2+⋯+xnn)dxn⋯dx1
And in particular we are interested in its variance var(¯Xn).
It is generally not possible to express p¯Xn(x) in a closed form given an arbitrary PDF pXX and a number n. But for the limit n→∞ it is possible to make an approximation. The very important result is called the central limit theorem. It tells us that as n goes to infinity, p¯Xn(x) approaches a Gaussian distribution whose mean and variance equal the true mean and variance, μXX and σ2X, respectively:
limn→∞p¯Xn(x)=(n2πvar(X))1/2e−n(x−ˉxn)22var(X)
The desired variance var(¯Xn), i.e. the sample error squared err2X, is given by:
err2X=var(¯Xn)=1n2∑ijcov(Xi,Xj)
We see now that in order to calculate the exact error of the sample with the above expression, we would need the true means μXXi of the stochastic variables Xi. To calculate these requires that we know the true multivariate PDF of all the Xi. But this PDF is unknown to us, we have only got the measurements of one sample. The best we can do is to let the sample itself be an estimate of the PDF of each of the Xi, estimating all properties of Xi through the measurements of the sample.
Our estimate of μXXi is then the sample mean ˉx itself, in accordance with the the central limit theorem:
μXXi=⟨xi⟩≈1nn∑k=1xk=ˉx
Using ˉx in place of μXXi we can give an estimate of the covariance in Eq. (13)
cov(Xi,Xj)=⟨(xi−⟨xi⟩)(xj−⟨xj⟩)⟩≈⟨(xi−ˉx)(xj−ˉx)⟩,
resulting in
1nn∑l(1nn∑k(xk−ˉxn)(xl−ˉxn))=1n1n∑kl(xk−ˉxn)(xl−ˉxn)=1ncov(x)
By the same procedure we can use the sample variance as an estimate of the variance of any of the stochastic variables Xi
var(Xi)=⟨xi−⟨xi⟩⟩≈⟨xi−ˉxn⟩,
which is approximated as
var(Xi)≈1nn∑k=1(xk−ˉxn)=var(x)
Now we can calculate an estimate of the error errXX of the sample mean ˉxn:
err2X=1n2∑ijcov(Xi,Xj)≈1n2∑ij1ncov(x)=1n2n21ncov(x)=1ncov(x)
which is nothing but the sample covariance divided by the number of measurements in the sample.
In the special case that the measurements of the sample are uncorrelated (equivalently the stochastic variables Xi are uncorrelated) we have that the off-diagonal elements of the covariance are zero. This gives the following estimate of the sample error:
err2X=1n2∑ijcov(Xi,Xj)=1n2∑ivar(Xi),
resulting in
err2X≈1n2∑ivar(x)=1nvar(x)
where in the second step we have used Eq. (14). The error of the sample is then just its standard deviation divided by the square root of the number of measurements the sample contains. This is a very useful formula which is easy to compute. It acts as a first approximation to the error, but in numerical experiments, we cannot overlook the always present correlations.
For computational purposes one usually splits up the estimate of err2X, given by Eq. (15), into two parts
err2X=1nvar(x)+1n(cov(x)−var(x)),
which equals
1n2n∑k=1(xk−ˉxn)2+2n2∑k<l(xk−ˉxn)(xl−ˉxn)
The first term is the same as the error in the uncorrelated case, Eq. (16). This means that the second term accounts for the error correction due to correlation between the measurements. For uncorrelated measurements this second term is zero.
Computationally the uncorrelated first term is much easier to treat efficiently than the second.
var(x)=1nn∑k=1(xk−ˉxn)2=(1nn∑k=1x2k)−ˉx2n
We just accumulate separately the values x2 and x for every measurement x we receive. The correlation term, though, has to be calculated at the end of the experiment since we need all the measurements to calculate the cross terms. Therefore, all measurements have to be stored throughout the experiment.
Let us analyze the problem by splitting up the correlation term into partial sums of the form:
fd=1n−dn−d∑k=1(xk−ˉxn)(xk+d−ˉxn)
The correlation term of the error can now be rewritten in terms of fd
2n∑k<l(xk−ˉxn)(xl−ˉxn)=2n−1∑d=1fd
The value of fd reflects the correlation between measurements separated by the distance d in the sample samples. Notice that for d=0, f is just the sample variance, var(x). If we divide fd by var(x), we arrive at the so called autocorrelation function
κd=fdvar(x)
which gives us a useful measure of pairwise correlations starting always at 1 for d=0.
The sample error (see eq. (17)) can now be written in terms of the autocorrelation function:
err2X=1nvar(x)+2n⋅var(x)n−1∑d=1fdvar(x)=(1+2n−1∑d=1κd)1nvar(x)=τn⋅var(x)
and we see that errX can be expressed in terms the uncorrelated sample variance times a correction factor τ which accounts for the correlation between measurements. We call this correction factor the autocorrelation time:
τ=1+2n−1∑d=1κd
For a correlation free experiment, τ equals 1. From the point of view of eq. (18) we can interpret a sequential correlation as an effective reduction of the number of measurements by a factor τ. The effective number of measurements becomes:
neff=nτ
To neglect the autocorrelation time τ will always cause our simple uncorrelated estimate of err2X≈var(x)/n to be less than the true sample error. The estimate of the error will be too good. On the other hand, the calculation of the full autocorrelation time poses an efficiency problem if the set of measurements is very large.
The so-called time-displacement autocorrelation ϕ(t) for a quantity M is given by
ϕ(t)=∫dt′[M(t′)−⟨M⟩][M(t′+t)−⟨M⟩],
which can be rewritten as
ϕ(t)=∫dt′[M(t′)M(t′+t)−⟨M⟩2],
where ⟨M⟩ is the average value and M(t) its instantaneous value. We can discretize this function as follows, where we used our set of computed values M(t) for a set of discretized times (our Monte Carlo cycles corresponding to moving all electrons?)
ϕ(t)=1tmax−ttmax−t∑t′=0M(t′)M(t′+t)−1tmax−ttmax−t∑t′=0M(t′)×1tmax−ttmax−t∑t′=0M(t′+t).
One should be careful with times close to tmax, the upper limit of the sums becomes small and we end up integrating over a rather small time interval. This means that the statistical error in ϕ(t) due to the random nature of the fluctuations in M(t) can become large.
One should therefore choose t≪tmax.
Note that the variable M can be any expectation values of interest.
The time-correlation function gives a measure of the correlation between the various values of the variable at a time t′ and a time t′+t. If we multiply the values of M at these two different times, we will get a positive contribution if they are fluctuating in the same direction, or a negative value if they fluctuate in the opposite direction. If we then integrate over time, or use the discretized version of, the time correlation function ϕ(t) should take a non-zero value if the fluctuations are correlated, else it should gradually go to zero. For times a long way apart the different values of M are most likely uncorrelated and ϕ(t) should be zero.
We can derive the correlation time by observing that our Metropolis algorithm is based on a random walk in the space of all possible spin configurations. Our probability distribution function ˆw(t) after a given number of time steps t could be written as
ˆw(t)=ˆWtˆw(0),
with ˆw(0) the distribution at t=0 and ˆW representing the transition probability matrix. We can always expand ˆw(0) in terms of the right eigenvectors of ˆv of ˆW as
ˆw(0)=∑iαiˆvi,
resulting in
ˆw(t)=ˆWtˆw(0)=ˆWt∑iαiˆvi=∑iλtiαiˆvi,
with λi the ith eigenvalue corresponding to the eigenvector ˆvi.
If we assume that λ0 is the largest eigenvector we see that in the limit t→∞, ˆw(t) becomes proportional to the corresponding eigenvector ˆv0. This is our steady state or final distribution.
We can relate this property to an observable like the mean energy. With the probabilty ˆw(t) (which in our case is the squared trial wave function) we can write the expectation values as
⟨M(t)⟩=∑μˆw(t)μMμ,
or as the scalar of a vector product
⟨M(t)⟩=ˆw(t)m,
with m being the vector whose elements are the values of Mμ in its various microstates μ.
We rewrite this relation as
⟨M(t)⟩=ˆw(t)m=∑iλtiαiˆvimi.
If we define mi=ˆvimi as the expectation value of M in the ith eigenstate we can rewrite the last equation as
⟨M(t)⟩=∑iλtiαimi.
Since we have that in the limit t→∞ the mean value is dominated by the the largest eigenvalue λ0, we can rewrite the last equation as
⟨M(t)⟩=⟨M(∞)⟩+∑i≠0λtiαimi.
We define the quantity
τi=−1logλi,
and rewrite the last expectation value as
⟨M(t)⟩=⟨M(∞)⟩+∑i≠0αimie−t/τi.
The quantities τi are the correlation times for the system. They control also the auto-correlation function discussed above. The longest correlation time is obviously given by the second largest eigenvalue τ1, which normally defines the correlation time discussed above. For large times, this is the only correlation time that survives. If higher eigenvalues of the transition matrix are well separated from λ1 and we simulate long enough, τ1 may well define the correlation time. In other cases we may not be able to extract a reliable result for τ1. Coming back to the time correlation function ϕ(t) we can present a more general definition in terms of the mean magnetizations $ \langle \mathbf{M}(t) \rangle$. Recalling that the mean value is equal to $ \langle \mathbf{M}(\infty) \rangle$ we arrive at the expectation values
ϕ(t)=⟨M(0)−M(∞)⟩⟨M(t)−M(∞)⟩,
resulting in
ϕ(t)=∑i,j≠0miαimjαje−t/τi,
which is appropriate for all times.
If the correlation function decays exponentially
ϕ(t)∼exp(−t/τ)
then the exponential correlation time can be computed as the average
τexp=−⟨tlog|ϕ(t)ϕ(0)|⟩.
If the decay is exponential, then
∫∞0dtϕ(t)=∫∞0dtϕ(0)exp(−t/τ)=τϕ(0),
which suggests another measure of correlation
τint=∑kϕ(k)ϕ(0),
called the integrated correlation time.
Two famous resampling methods are the independent bootstrap and the jackknife.
The jackknife is a special case of the independent bootstrap. Still, the jackknife was made popular prior to the independent bootstrap. And as the popularity of the independent bootstrap soared, new variants, such as the dependent bootstrap.
The Jackknife and independent bootstrap work for independent, identically distributed random variables. If these conditions are not satisfied, the methods will fail. Yet, it should be said that if the data are independent, identically distributed, and we only want to estimate the variance of ¯X (which often is the case), then there is no need for bootstrapping.
The Jackknife works by making many replicas of the estimator ˆθ. The jackknife is a resampling method, we explained that this happens by scrambling the data in some way. When using the jackknife, this is done by systematically leaving out one observation from the vector of observed values ˆx=(x1,x2,⋯,Xn). Let ˆxi denote the vector
ˆxi=(x1,x2,⋯,xi−1,xi+1,⋯,xn),
which equals the vector ˆx with the exception that observation number i is left out. Using this notation, define ˆθi to be the estimator ˆθ computed using →Xi.
To get an estimate for the bias and standard error of ˆθ, use the following estimators for each component of ˆθ
^Bias(ˆθ,θ)=(n−1)(−ˆθ+1nn∑i=1ˆθi)andˆσ2ˆθ=n−1nn∑i=1(ˆθi−1nn∑j=1ˆθj)2.
from numpy import *
from numpy.random import randint, randn
from time import time
def jackknife(data, stat):
n = len(data);t = zeros(n); inds = arange(n); t0 = time()
## 'jackknifing' by leaving out an observation for each i
for i in range(n):
t[i] = stat(delete(data,i) )
# analysis
print("Runtime: %g sec" % (time()-t0)); print("Jackknife Statistics :")
print("original bias std. error")
print("%8g %14g %15g" % (stat(data),(n-1)*mean(t)/n, (n*var(t))**.5))
return t
# Returns mean of data samples
def stat(data):
return mean(data)
mu, sigma = 100, 15
datapoints = 10000
x = mu + sigma*random.randn(datapoints)
# jackknife returns the data sample
t = jackknife(x, stat)
Bootstrapping is a nonparametric approach to statistical inference that substitutes computation for more traditional distributional assumptions and asymptotic results. Bootstrapping offers a number of advantages:
Since ˆθ=ˆθ(ˆX) is a function of random variables, ˆθ itself must be a random variable. Thus it has a pdf, call this function p(ˆt). The aim of the bootstrap is to estimate p(ˆt) by the relative frequency of ˆθ. You can think of this as using a histogram in the place of p(ˆt). If the relative frequency closely resembles p(→t), then using numerics, it is straight forward to estimate all the interesting parameters of p(ˆt) using point estimators.
In the case that ˆθ has more than one component, and the components are independent, we use the same estimator on each component separately. If the probability density function of Xi, p(x), had been known, then it would have been straight forward to do this by:
By repeated use of (1) and (2), many estimates of ˆθ could have been obtained. The idea is to use the relative frequency of ˆθ∗ (think of a histogram) as an estimate of p(ˆt).
But unless there is enough information available about the process that generated X1,X2,⋯,Xn, p(x) is in general unknown. Therefore, Efron in 1979 asked the question: What if we replace p(x) by the relative frequency of the observation Xi; if we draw observations in accordance with the relative frequency of the observations, will we obtain the same result in some asymptotic sense? The answer is yes.
Instead of generating the histogram for the relative frequency of the observation Xi, just draw the values (X∗1,X∗2,⋯,X∗n) with replacement from the vector ˆX.
The independent bootstrap works like this:
When you are done, you can draw a histogram of the relative frequency of ˆθ∗. This is your estimate of the probability distribution p(t). Using this probability distribution you can estimate any statistics thereof. In principle you never draw the histogram of the relative frequency of ˆθ∗. Instead you use the estimators corresponding to the statistic of interest. For example, if you are interested in estimating the variance of ˆθ, apply the etsimator ˆσ2 to the values ˆθ∗.
The following code starts with a Gaussian distribution with mean value μ=100 and variance σ=15. We use this to generate the data used in the bootstrap analysis. The bootstrap analysis returns a data set after a given number of bootstrap operations (as many as we have data points). This data set consists of estimated mean values for each bootstrap operation. The histogram generated by the bootstrap method shows that the distribution for these mean values is also a Gaussian, centered around the mean value μ=100 but with standard deviation σ/√n, where n is the number of bootstrap samples (in this case the same as the number of original data points). The value of the standard deviation is what we expect from the central limit theorem.
%matplotlib inline
from numpy import *
from numpy.random import randint, randn
from time import time
from scipy.stats import norm
import matplotlib.pyplot as plt
# Returns mean of bootstrap samples
def stat(data):
return mean(data)
# Bootstrap algorithm
def bootstrap(data, statistic, R):
t = zeros(R); n = len(data); inds = arange(n); t0 = time()
# non-parametric bootstrap
for i in range(R):
t[i] = statistic(data[randint(0,n,n)])
# analysis
print("Runtime: %g sec" % (time()-t0)); print("Bootstrap Statistics :")
print("original bias std. error")
print("%8g %8g %14g %15g" % (statistic(data), std(data),\
mean(t), \
std(t)))
return t
mu, sigma = 100, 15
datapoints = 10000
x = mu + sigma*random.randn(datapoints)
# bootstrap returns the data sample t = bootstrap(x, stat, datapoints)
# the histogram of the bootstrapped data
t = bootstrap(x, stat, datapoints)
# the histogram of the bootstrapped data
n, binsboot, patches = plt.hist(t, bins=50, density='true',histtype='bar', color='red', alpha=0.75)
# add a 'best fit' line
y = norm.pdf( binsboot, mean(t), std(t))
lt = plt.plot(binsboot, y, 'r--', linewidth=1)
plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.axis([99.5, 100.6, 0, 3.0])
plt.grid(True)
plt.show()
The blocking method was made popular by Flyvbjerg and Pedersen (1989) and has become one of the standard ways to estimate V(ˆθ) for exactly one ˆθ, namely ˆθ=¯X.
Assume n=2d for some integer d>1 and X1,X2,⋯,Xn is a stationary time series to begin with. Moreover, assume that the time series is asymptotically uncorrelated. We switch to vector notation by arranging X1,X2,⋯,Xn in an n-tuple. Define:
ˆX=(X1,X2,⋯,Xn).
The strength of the blocking method is when the number of observations, n is large. For large n, the complexity of dependent bootstrapping scales poorly, but the blocking method does not, moreover, it becomes more accurate the larger n is.
We now define blocking transformations. The idea is to take the mean of subsequent pair of elements from →X and form a new vector →X1. Continuing in the same way by taking the mean of subsequent pairs of elements of →X1 we obtain →X2, and so on. Define →Xi recursively by:
(→X0)k≡(→X)k(→Xi+1)k≡12((→Xi)2k−1+(→Xi)2k)for all1≤i≤d−1
The quantity →Xk is subject to k blocking transformations. We now have d vectors →X0,→X1,⋯,→Xd−1 containing the subsequent averages of observations. It turns out that if the components of →X is a stationary time series, then the components of →Xi is a stationary time series for all 0≤i≤d−1
We can then compute the autocovariance, the variance, sample mean, and number of observations for each i. Let γi,σ2i,¯Xi denote the autocovariance, variance and average of the elements of →Xi and let ni be the number of elements of →Xi. It follows by induction that ni=n/2i.
Using the definition of the blocking transformation and the distributive property of the covariance, it is clear that since h=|i−j| we can define
γk+1(h)=cov((Xk+1)i,(Xk+1)j)=14cov((Xk)2i−1+(Xk)2i,(Xk)2j−1+(Xk)2j)=12γk(2h)+12γk(2h+1)h=0=14γk(2h−1)+12γk(2h)+14γk(2h+1)else
The quantity ˆX is asymptotic uncorrelated by assumption, ˆXk is also asymptotic uncorrelated. Let's turn our attention to the variance of the sample mean V(¯X).
We have
V(¯Xk)=σ2knk+2nknk−1∑h=1(1−hnk)γk(h)⏟≡ek=σ2knk+ekifγk(0)=σ2k.
The term ek is called the truncation error:
ek=2nknk−1∑h=1(1−hnk)γk(h).
We can show that V(¯Xi)=V(¯Xj) for all 0≤i≤d−1 and 0≤j≤d−1.
We can then wrap up
nj+1¯Xj+1=nj+1∑i=1(ˆXj+1)i=12nj/2∑i=1(ˆXj)2i−1+(ˆXj)2i=12[(ˆXj)1+(ˆXj)2+⋯+(ˆXj)nj]=nj2⏟=nj+1¯Xj=nj+1¯Xj.
By repeated use of this equation we get V(¯Xi)=V(¯X0)=V(¯X) for all 0≤i≤d−1. This has the consequence that
V(¯X)=σ2knk+ekfor all0≤k≤d−1.
Flyvbjerg and Petersen demonstrated that the sequence {ek}d−1k=0 is decreasing, and conjecture that the term ek can be made as small as we would like by making k (and hence d) sufficiently large. The sequence is decreasing (Master of Science thesis by Marius Jonsson, UiO 2018). It means we can apply blocking transformations until ek is sufficiently small, and then estimate V(¯X) by ˆσ2k/nk.
For an elegant solution and proof of the blocking method, see the recent article of Marius Jonsson (former MSc student of the Computational Physics group).