Mainly based on Wooldridge (2019), Chapter 5

Code
# Function for Monte-Carlo simulation for regressions with one or two variables

sim <- function(n=100, rep=1000, nvar=1, arcoef=0, sige=1, sigu1=1, sigu2=1, correu1=0, correu2=0, corru1u2=0, autocorr=0, debug=0, distr=0, histdistr=0) {
  
  # start clock
  starttime <- proc.time()
  
  # true parameters
  B0 = 1
  B1 = 0.5
  B2 = 0.5
  
  OLS  <- vector(mode = "list", length = rep) # initializing list for storing results
  SOLS <- vector(mode = "list", length = rep) # initializing list for storing results
  
  ######################### rep loop #################################################
  for (i in (1:rep)) {
    
    if (nvar==1) {
      
      if (distr==0) {
        u1 = rnorm(n+10, mean = 0, sd = 1)
        e =  rnorm(n+10, mean = 0, sd = 1)        
      }
      
      if (distr==1) {
        u1 = rnorm(n+10, mean = 0, sd = 1)
        e =  (rchisq(n+10, 2) - 2)/2
      }
      
      
      
      covmat = matrix( c(sige^2, correu1*sige*sigu1, correu1*sige*sigu1, sigu1^2), ncol = 2)
      M <- t(chol(covmat)) # lower triangular matrix
      Z = t( M %*% t( cbind(e, u1) ) )  # covmat = M cov(e,u1) M'
    
      x1 = filter(Z[,2], arcoef, method = "recursive")  # AR(1) process
      tsp(x1) <- NULL   # remove ts property, for better plotting
      maxx1 = max(x1)
      minx1 = min(x1)
      
      e = filter(Z[,1], autocorr, method = "recursive")
      tsp(e) <- NULL   # remove ts property, for better plotting
      y = B0 + B1*x1 + e
      maxy = max(y)
      miny = min(y)
      
      y = y[11:(n+10)]; x1 = x1[11:(n+10)]   # keep the last n observations (drop the 10 burn-in values)
      
      OLS[[i]] = lm(y ~ x1, model = FALSE)
      
    }
    
    if (nvar==2) {
      
      if (distr==0) {
        u1 = rnorm(n+10, mean = 0, sd = 1)
        u2 = rnorm(n+10, mean = 0, sd = 1) 
        e =  rnorm(n+10, mean = 0, sd = 1)        
      }
      
      if (distr==1) {
        u1 = rnorm(n+10, mean = 0, sd = 1)
        u2 = rnorm(n+10, mean = 0, sd = 1) 
        e =  (rchisq(n+10, 2) - 2)/2
      }
      

      covmat = matrix( c(sige^2, correu1*sige*sigu1, correu2*sige*sigu2,   correu1*sige*sigu1, sigu1^2, corru1u2*sigu1*sigu2,  correu2*sige*sigu2, corru1u2*sigu1*sigu2, sigu2^2 ), ncol = 3)
      
      M <- t(chol(covmat)) # lower triangular matrix
      Z = t( M %*% t( cbind(e, u1, u2) ) )  # covmat = M cov(e,u1,u2) M'
      
      x1 = filter(Z[,2], arcoef, method = "recursive")  # AR(1) process
      tsp(x1) <- NULL   # remove ts property, for better plotting
      maxx1 = max(x1)
      minx1 = min(x1)
      
      x2 = filter(Z[,3], arcoef, method = "recursive")
      tsp(x2) <- NULL   # remove ts property, for better plotting
      
      e = filter(Z[,1], autocorr, method = "recursive")
      tsp(e) <- NULL   # remove ts property, for better plotting
      y = B0 + B1*x1 + B2*x2 + e
      maxy = max(y)
      miny = min(y)
      
      y = y[11:(n+10)]; x1 = x1[11:(n+10)]; x2 = x2[11:(n+10)]   # keep the last n observations (drop the 10 burn-in values)
      
      OLS[[i]] = lm(y ~ x1 + x2, model = FALSE)
    }
    
    SOLS[[i]] = summary( OLS[[i]] )
  }
  ########################## end rep loop ############################################
  
  
  # rep = 1: scatterplot with true and estimated regline
  if (rep==1) {
    plot(y ~ x1, col="blue")
    abline(OLS[[i]], col="blue")
    abline(c(B0,B1), col="red")
  }

  
  # rep > 1 and <= 100: true and up to 100 estimated reglines
  if (rep>1 & rep<=100) {
    plot(NULL, xlim = c(minx1*1.1, maxx1*1.1), ylim = c(miny*1.1, maxy*1.1))
    for (i in 1:rep) abline(OLS[[i]], col="grey")
    abline(c(B0,B1), col="red")
  }
  
  
  # rep > 100: histograms of estimated parameters and sigma
  if (rep > 100) {
    BB = c(B0, B1, B2)
    par(mfrow = c(2, 2))
    
    for (i in (0:nvar)) {
      bb = paste0("b",i)
      eval( parse( text=paste0(bb,"<-", mean(unlist(lapply(OLS, function(x) coef(x)[i+1])))) ) ) 

      if (histdistr==0) {
        hist(unlist(lapply(OLS, function(x) coef(x)[i+1])), freq = FALSE, breaks = 26, xlim = c( eval(parse(text=bb))-0.5, eval(parse(text=bb))+0.5 ), main = paste0("Distribution of b",i), xlab = bb) 
      }
      if (histdistr==1) {
        bdistri <- unlist(lapply(OLS, function(x) coef(x)[i+1]))
        hist(bdistri, freq = FALSE, breaks = 30,  main = paste0("Distribution of b",i), xlab = bb) 
        curve(dnorm(x, mean = mean(bdistri),sd=sd(bdistri)),add = TRUE, col = "blue")
      }
      abline(v=BB[i+1], col = "red")
      
    }
    
    
    
    
    s =  mean(unlist(lapply(SOLS, function(x) x$sigma))) 
    if (histdistr==0) {
    hist(unlist(lapply(SOLS, function(x) x$sigma)), freq = FALSE, breaks = 26, xlim = c(s-0.5, s+0.5), main = "Distribution of sigma", xlab = "sigma") 
    }
    
    if (histdistr==1) {
    sigdistri <- unlist(lapply(SOLS, function(x) x$sigma))
    hist(sigdistri, freq = FALSE, breaks = 30,  main = "Distribution of sigma", xlab = "sigma") 
    curve(dnorm(x, mean = mean(sigdistri),sd=sd(sigdistri)),add = TRUE, col = "blue")
    }
    abline(v=sqrt(sige^2/(1-autocorr^2)), col = "red")
    
    par(mfrow = c(1, 1))
  }
  
  # result table
  if (rep > 100) {
    results <- data.frame(Mean_b1 = b1)
    results$sd_b1 <- sd(unlist(lapply(OLS, function(x) coef(x)[2]))) 
    results$mean_se_b1 <- mean(unlist(lapply(SOLS, function(x) x$coef[2,2]))) 
    results$size_b1_0.05 <- sum( unlist(lapply(SOLS, function(x) abs(x$coef[2,1]-B1)/x$coef[2,2] )) > 1.96) / rep
    
    if (nvar == 2) {
      results$Mean_b2 <-  mean(unlist(lapply(OLS, function(x) coef(x)[3]))) 
      results$sd_b2 <- sd(unlist(lapply(OLS, function(x) coef(x)[3]))) 
      results$mean_se_b2 <- mean(unlist(lapply(SOLS, function(x) x$coef[3,2]))) 
      results$size_b2_0.05 <- sum( unlist(lapply(SOLS, function(x) abs(x$coef[3,1]-B2)/x$coef[3,2] )) > 1.96) / rep
    }
    results$mean_sigma <- s
    results$sd_sigma <- sd(unlist(lapply(SOLS, function(x) x$sigma))) 
    
    stargazer::stargazer(results, type = "text", summary = FALSE, rownames = FALSE)
  }
  

  if (debug==1) {
    print("covmat")
    print(covmat)
    print("chol")
    print(M)
    print("MM' = covmat")
    print(M %*% t(M))
    print("colMeans")
    print(colMeans(Z))
    print( "new covmat")
    print( cov(Z) )
    print( "new cormat" )
    print( cor(Z) )
  }
  
# Stop the clock
endtime <- proc.time()
print(endtime-starttime)

}

4.1 Motivation

So far, we have mainly focused on properties of OLS that hold for any sample size; this chapter adds properties that hold in large samples:
  • Properties of OLS that hold for any sample/sample size (see Section 2.7)

    • Expected values/unbiasedness under MLR.1 – MLR.4

    • Variance formulas under MLR.1 – MLR.5

    • Gauss-Markov Theorem under MLR.1 – MLR.5

    • Exact sampling distributions/tests under MLR.1 – MLR.6

  • Properties of OLS that hold in large samples

    • Consistency under MLR.1 – MLR.4’

    • Asymptotic normality/tests under MLR.1 – MLR.4’


4.2 Consistency

  • An estimator \hat\theta_n for a sample size n is consistent for a population (true) parameter \theta if P(|\hat\theta_n - \theta| < \epsilon) \rightarrow 1, \ \text{ for arbitrary } \epsilon > 0 \ \text{ and } \ n \rightarrow \infty

  • Alternative notations: probability limit of \hat \theta is \theta: \operatorname{plim}(\hat\theta) = \theta

  • or, \hat \theta converges in probability to \theta: \ \hat \theta \ \stackrel{p}{\longrightarrow} \ \theta

  • Interpretation

  • Consistency means that the probability that the estimate is arbitrarily close to the true population value can be made arbitrarily high by increasing the sample size

  • Loosely speaking, this means that with a larger sample size the estimate becomes more precise and eventually converges to the true parameter value

  • Consistency is the most important requirement for sensible estimators (a minimal numerical illustration of this definition follows below)
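
The definition can be illustrated with a minimal simulation sketch in base R (not part of the sim() function above; the exponential(1) example, the tolerance eps = 0.1 and the sample sizes are chosen purely for illustration): we approximate P(|\hat\theta_n - \theta| < \epsilon) by Monte Carlo for the sample mean of an exponential(1) sample, whose true mean is \theta = 1, and watch this probability approach 1 as n grows.

# Sketch: approximate P(|theta_hat_n - theta| < eps) by Monte Carlo for the
# sample mean of an exponential(1) sample (true mean theta = 1)
set.seed(123)
theta <- 1; eps <- 0.1; rep <- 2000

for (n in c(10, 100, 1000, 10000)) {
  close <- replicate(rep, abs(mean(rexp(n, rate = 1)) - theta) < eps)
  cat(sprintf("n = %5d:  P(|theta_hat - theta| < %.1f) is approx. %.3f\n",
              n, eps, mean(close)))
}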


4.2.1 Consistency of OLS

Theorem 4.1 (Consistency of \hat {\boldsymbol \beta} and \hat \sigma^2) Under MLR.1 - MLR.4 we have

\operatorname{plim}(\hat{\boldsymbol{\beta}}) = \boldsymbol\beta

Under MLR.1 - MLR.5 we have

\operatorname{plim}({\hat \sigma^2}) = \sigma^2

Important Remark, especially for time series:

Assumption MLR.4, E(u|\mathbf X) = 0 or E(u_i|\mathbf x_i) = 0 plus random sampling can be relaxed to

Assumption MLR.4’: E(u_i|\mathbf x_i) = 0 without random sampling, or even further to E(u_i)=0 and \operatorname{Cov}(\mathbf x_i, u_i)=0 (a short illustration follows below).
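
To see what the weaker condition buys us, consider the following sketch (an illustrative construction, not taken from Wooldridge 2019): for standard normal x_i, the error term u_i = (x_i^2 - 1)/\sqrt 2 satisfies E(u_i) = 0 and \operatorname{Cov}(x_i, u_i) = 0 although E(u_i|\mathbf x_i) \neq 0; the OLS slope is nevertheless consistent.

# Sketch: E(u|x) != 0, but E(u) = 0 and Cov(x, u) = E(x^3)/sqrt(2) = 0 for
# standard normal x, so the weaker MLR.4' condition holds and the OLS
# slope estimate still converges to the true value 0.5
set.seed(42)
for (n in c(100, 10000, 100000)) {
  x <- rnorm(n)
  u <- (x^2 - 1) / sqrt(2)              # Var(u) = 1, Cov(x, u) = 0, E(u|x) != 0
  y <- 1 + 0.5 * x + u
  cat(sprintf("n = %6d:  b1_hat = %.4f\n", n, coef(lm(y ~ x))[2]))
}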


4.2.2 Proof of \operatorname{plim}(\hat{\boldsymbol{\beta}}) = \boldsymbol\beta

  • Our true model (population model or data generating process) is:

\mathbf y = \mathbf X \boldsymbol\beta + \mathbf u \tag{4.1}

  • This leads to the OLS estimator for \boldsymbol \beta:

\hat{\boldsymbol{\beta}}=(\mathbf{X}' \mathbf{X})^{-1} \mathbf{X}' \mathbf{y} \tag{4.2}

\hat{\boldsymbol{\beta}} \ = \ (\mathbf{X}' \mathbf{X})^{-1} \mathbf{X}' (\mathbf{X} \boldsymbol\beta + \mathbf{u}) \ = \ \boldsymbol{\beta}+\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{X}' \mathbf{u} \tag{4.3}

  • Dividing and multiplying the last term by n we get:

\hat{\boldsymbol{\beta}} \ = \ \boldsymbol{\beta}+\left(\dfrac{\mathbf{X}' \mathbf{X}}{n} \right)^{-1} \left( \dfrac{\mathbf{X}' \mathbf{u}}{n} \right) \ =

= \ \boldsymbol{\beta}+\left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right )^{-1} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x'_i u_i \right ) \tag{4.4}

  • Now we can apply the Law of Large Numbers (LLN, Theorem A.2) i.e., empirical moments (averages) converge in probability to the expected values of their summands under quite general conditions

    1. The expected value of the first summand is E(\mathbf x_i' \mathbf x_i) := \mathbf m_{xx}, the moment matrix of the variables in \mathbf X (the second moments have to exist, which is a requirement for the LLN)

    2. The expected value of the second summand is by MLR.4’: E(\mathbf x'_i u_i) := \mathbf m_{xu} = E_x(E(\mathbf x'_i u_i | \mathbf x_i)) = E_x(\mathbf 0) = \mathbf 0

  • Hence, we have for Equation 4.4 (by Slutsky’s theorem, see Theorem A.4, especially Equation A.9 and Equation A.10):

\operatorname{plim} \hat{\boldsymbol{\beta}} \ = \ \boldsymbol{\beta} + \operatorname{plim} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right )^{-1} \operatorname{plim} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x'_i u_i \right ) \ =

= \ \boldsymbol{\beta} + \mathbf m_{xx}^{-1} \, \mathbf m_{xu} \ = \ \boldsymbol{\beta} + \mathbf m_{xx}^{-1} \, \mathbf 0 \ = \ \boldsymbol{\beta} \tag{5.5}

q.e.d.
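
The decomposition in Equation 4.4 can also be checked numerically. The following sketch (illustrative setup and values only) computes \hat{\boldsymbol\beta} once directly and once via \boldsymbol\beta + (\mathbf X'\mathbf X/n)^{-1}(\mathbf X'\mathbf u/n), and shows that \mathbf X'\mathbf u/n shrinks towards zero as n grows.

# Sketch: verify beta_hat = beta + (X'X/n)^{-1} (X'u/n) (Equation 4.4) and
# watch X'u/n shrink towards 0 with growing n
set.seed(1)
beta <- c(1, 0.5)
for (n in c(100, 10000, 1000000)) {
  X <- cbind(1, rnorm(n))
  u <- rnorm(n)
  y <- X %*% beta + u
  bhat  <- solve(crossprod(X), crossprod(X, y))                # (X'X)^{-1} X'y
  check <- beta + solve(crossprod(X) / n, crossprod(X, u) / n)
  cat(sprintf("n = %7d:  max|X'u/n| = %.5f,  max|bhat - check| = %.1e\n",
              n, max(abs(crossprod(X, u) / n)), max(abs(bhat - check))))
}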


Proof of \operatorname{plim}({\hat \sigma^2}) = \sigma^2

We have:

\hat\sigma^2 = \frac{1}{n-k-1} \sum_{i=1}^n \hat u_i^2 = \frac{1}{n-k-1} \, \hat{\mathbf u}' \hat{\mathbf u} = \frac{1}{n-k-1} \, \mathbf y' \mathbf {M_x} \mathbf y = \frac{1}{n-k-1} \, \mathbf u' \mathbf {M_x} \mathbf u

The third expression follows directly from the definition and idempotence of \mathbf {M_x} (see the discussion of the FWL theorem, Section 2.5.1 or Equation C.10), and for the last one we have:

\mathbf {M_x} \mathbf y = \mathbf {M_x} (\underbrace{\mathbf X \boldsymbol{\beta}+\mathbf u}_\mathbf y ) = \underbrace{\mathbf {M_x} \mathbf X}_\mathbf 0 \, \boldsymbol{\beta} + \mathbf {M_x} \mathbf u = \mathbf {M_x} \mathbf u

So, we finally need the plim of:

\hat\sigma^2 = \frac{1}{n-k-1} \, \mathbf u' \mathbf {M_x} \mathbf u = \frac{1}{n-k-1} \mathbf u' ( \underbrace{ \mathbf I - \mathbf X(\mathbf X '\mathbf X)^{-1} \mathbf X' }_\mathbf{M_x} ) \mathbf u =

=\frac{n}{n-k-1}\left[\frac{\mathbf{u}^{\prime} \mathbf{u}}{n}-\left(\frac{\mathbf{u}^{\prime} \mathbf{X}}{n}\right)\left(\frac{\mathbf{X}^{\prime} \mathbf{X}}{n}\right)^{-1}\left(\frac{\mathbf{X}^{\prime} \mathbf{u}}{n}\right)\right] \ \ \Rightarrow

\operatorname{plim} \hat\sigma^2 \ =

\underbrace{\operatorname{plim}\left[\frac{n}{n-k-1}\right]}_{1} \left[ \operatorname{plim}\left[\frac{\mathbf{u}^{\prime} \mathbf{u}}{n}\right]-\operatorname{plim}\underbrace{\left[\frac{\mathbf{u}^{\prime} \mathbf{X}}{n}\right]}_{\mathbf {\bar m_{xu}}\rightarrow \mathbf 0} \operatorname{plim}\underbrace{\left[\left(\frac{\mathbf{X}^{\prime} \mathbf{X}}{n}\right)^{-1}\right]}_{\mathbf {\bar m_{xx}^{-1}}\rightarrow \mathbf {m_{xx}^{-1}}} \operatorname{plim}\underbrace{\left[\frac{\mathbf{X}^{\prime} \mathbf{u}}{n}\right]}_{\mathbf {\bar m_{xu}}\rightarrow \mathbf 0} \right ] =

\operatorname{plim} \left[ \frac{\mathbf{u}^{\prime} \mathbf{u}}{n} \right] \ = \ \operatorname{plim} \left[ \frac{1}{n} \sum_{i=1}^{n} u_{i}^{2} \right]

The last term is an average of a sequence of random variables and, according to the LLN, converges to its expected value:

E(u_i^2) = E_x [ E(u_i^2 | \mathbf x_i)] = \sigma^2

In the case of heteroscedasticity we would have: E(u_i^2) = E_x [ E(u_i^2 | \mathbf x_i)] = E_x(\sigma_i^2) = \bar \sigma^2.

q.e.d.
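
The algebra above is easy to verify numerically. In the following sketch (a single simulated data set with one regressor, purely for illustration), \mathbf u' \mathbf{M_x} \mathbf u/(n-k-1) reproduces the \hat\sigma^2 reported by summary(lm()) exactly.

# Sketch: check sigma_hat^2 = u' M_x u / (n-k-1) with M_x = I - X(X'X)^{-1}X',
# where X contains the intercept column and one regressor (k = 1)
set.seed(7)
n <- 200; k <- 1
x <- rnorm(n); u <- rnorm(n)
y <- 1 + 0.5 * x + u
X  <- cbind(1, x)
Mx <- diag(n) - X %*% solve(crossprod(X)) %*% t(X)
c(via_Mx = drop(t(u) %*% Mx %*% u) / (n - k - 1),
  via_lm = summary(lm(y ~ x))$sigma^2)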


4.2.3 Some examples - a Monte-Carlo simulation experiment

In the following we simulate a regression model to demonstrate the concept of consistency

  • The model is:

y_i = 1 + 0.5 x_i + u_i

  • In the first step, we generate a sample of 100 random draws of u_i and x_i, with Var(u_i) = 1, Var(x_i) = 1 and Cov(u_i, x_i) = 0, and calculate y_i. Then we estimate the model above with these simulated data by OLS to get \hat\beta_0, \hat\beta_1 and \hat\sigma^2. The procedure is repeated 5000 times and the results for \hat\beta_0, \hat\beta_1 and \hat\sigma^2 are shown as histograms to display the distributions of the estimated parameters

  • In the second step, we repeat the whole procedure, but this time with a sample length of 30000. As one can see, the distributions of the estimators collapse to a spike over the true parameter values, which are marked by the vertical red lines. The estimators are consistent!

  • In the third and fourth step, the two steps above are repeated, but this time with Cov(u_i, x_i) = 0.3. Hence, E(u_i | \mathbf x_i) \neq 0, which means our assumption MLR.4’ is not met

  • The histograms of the estimators clearly show that \hat\beta_1 and \hat\sigma^2 are no longer consistent, although the correlation between u_i and x_i is quite moderate


ret <- sim(n=100,rep=5000,nvar=1)

=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.501   0.106   0.107       0.054       0.998     0.075  
---------------------------------------------------------
   user  system elapsed 
  1.747   0.017   1.776 

Figure 4.1: Simulation with n=100 and 5000 replications

ret <- sim(n=30000,rep=1000,nvar=1)

=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.500   0.006   0.006       0.051       1.000     0.004  
---------------------------------------------------------
   user  system elapsed 
 82.296  22.735  13.539 

Figure 4.2: Simulation with n=30000 and 1000 replications

ret <- sim(n=100,rep=5000,nvar=1,correu1=0.3)

=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.801   0.103   0.102       0.839       0.950     0.071  
---------------------------------------------------------
   user  system elapsed 
  1.904   0.110   1.709 

Figure 4.3: Simulation with n=100 and 5000 replications, E(u|x) != 0

ret <- sim(n=30000,rep=1000,nvar=1,correu1=0.3)

=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.800   0.006   0.006         1         0.954     0.004  
---------------------------------------------------------
   user  system elapsed 
 80.888  19.745  13.158 

Figure 4.4: Simulation with n=30000 and 1000 replications, E(u|x) != 0

4.3 Asymptotic normality

  • For carrying out statistical tests we needed MLR.6 – normality of the error term. But this assumption is often questionable, and in some cases even impossible to meet, e.g., if the dependent variable is restricted to a certain range

  • If MLR.6 does not hold, the results of t- or F-tests may be wrong

  • Fortunately, F- and t-tests still work if the sample size is large enough

  • Also, OLS estimators are approximately normally distributed in large samples even without MLR.6

Hence, we can state the following theorem:

Theorem 4.2 (Asymptotic normality of \hat {\boldsymbol \beta}) Under assumptions MLR.1 – MLR.5:

\sqrt{n}(\hat {\boldsymbol{\beta}} - \boldsymbol{\beta}) \ \stackrel{d}{\longrightarrow} \ N \left( \mathbf 0, \ \sigma^2 \mathbf m_{\mathbf {xx}}^{-1} \right)

And a consistent estimate of the asymptotic distribution of \hat{\boldsymbol \beta} is:

\hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \hat \sigma^2 (\mathbf X' \mathbf X)^{-1} \right)

In particular, this trivially implies:

\dfrac{(\hat\beta_j - \beta_j)}{ se(\hat\beta_j)} \ \, \stackrel{a}{\sim} \ \, N(0,\, 1)

And furthermore, if E(\mathbf{R} \hat{\boldsymbol{\beta}}) = \mathbf{c} and \mathbf R has m rows:

W \ = \ (\mathbf{R} \hat{\boldsymbol{\beta}}-\mathbf{c})^{\prime}\left(\sigma^{2} \, \mathbf{R}(\mathbf{X}' \mathbf{X})^{-1} \mathbf{R}^{\prime}\right)^{-1}(\mathbf{R} \hat{\boldsymbol{\beta}}-\mathbf{c}) \ \, \stackrel{a}{\sim} \ \, \chi^{2}(m)
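
As an illustration of the last claim, the following sketch (an illustrative example; the data-generating setup, the restriction H_0\!: \beta_1 = 0.5, \beta_2 = 0.5 and all object names are chosen for demonstration only) computes W by hand, with \sigma^2 replaced by its consistent estimate \hat\sigma^2, and compares it with the \chi^2(2) critical value.

# Sketch: Wald statistic for R beta = c with R selecting the two slopes,
# c = (0.5, 0.5)' (m = 2), sigma^2 replaced by sigma_hat^2
set.seed(11)
n <- 500
x1 <- rnorm(n); x2 <- rnorm(n)
y  <- 1 + 0.5 * x1 + 0.5 * x2 + rnorm(n)
fit  <- lm(y ~ x1 + x2)
X    <- model.matrix(fit)
bhat <- coef(fit)
s2   <- summary(fit)$sigma^2
R    <- rbind(c(0, 1, 0),
              c(0, 0, 1))
cc   <- c(0.5, 0.5)
W <- drop(t(R %*% bhat - cc) %*%
          solve(s2 * R %*% solve(crossprod(X)) %*% t(R)) %*%
          (R %*% bhat - cc))
c(W = W, crit_0.05 = qchisq(0.95, df = nrow(R)),
  p_value = pchisq(W, df = nrow(R), lower.tail = FALSE))

Under H_0, W exceeds the 5% critical value in roughly 5% of repeated samples.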


4.3.1 Proof of asymptotic normality

Once more, our starting point is representation Equation 4.4 from the consistency proof, Section 4.2.2

\hat{\boldsymbol{\beta}} - \boldsymbol{\beta} \ = \ \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right )^{-1} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x'_i u_i \right )

  • From the consistency proof we know that the whole term converges in probability to \mathbf 0. Therefore, we multiply by \sqrt n, so that the term can converge in distribution to a non-degenerate random vector:

\sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \ = \ \underbrace{ \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right)^{-1}}_{\stackrel{p}{\longrightarrow} \ \mathbf {m_{xx}^{-1}}} \underbrace{\left( \frac{1}{\sqrt{n}} \sum_{i=1}^n \mathbf x'_i u_i \right ) }_{\stackrel{d}{\longrightarrow} \ N(0,\, \sigma^2 \mathbf {m_{xx}}) } \tag{4.5}

  • Under our standard assumptions and according to the LLN, the first term converges in probability to E(\mathbf x_i' \mathbf x_i )^{-1} := \mathbf m_{\mathbf {xx}}^{-1}

  • The more interesting term of Equation 4.5 is the second one. This term no longer converges to \mathbf 0, as we now divide by \sqrt n and not by n, as was the case in the consistency proof

  • According to the Central Limit Theorem (CLT, Theorem A.3), the second term, which is a sum of arbitrarily distributed random vectors divided by \sqrt n, converges in distribution, under conditions quite similar to those of the LLN, to a normally distributed random vector with expected value

E(\mathbf x_i' u_i) = E_x[\mathbf x_i' E(u_i | \mathbf x_i) ] = \mathbf 0

  • and because \operatorname{Cov}(\mathbf x_i' u_i, \mathbf x_j' u_j) = \mathbf 0 for i \neq j (random sampling), we can write for the covariance matrix

\operatorname {Var}(\mathbf x_i' u_i) = E( \mathbf x_i' u_i u_i \mathbf x_i ) = E_x [ E(\mathbf x_i' u_i u_i \mathbf x_i | \mathbf x_i) ] = E_x[ \mathbf x_i' E(u_i^2 | \mathbf x_i) \mathbf x_i ] = \sigma^2 E_x[ \mathbf x_i' \mathbf x_i ] = \sigma^2 \mathbf m_{\mathbf {xx}}

  • Hence, because of the convergence in probability of the first term of Equation 4.5 and the covariance formula from Equation C.7, the covariance matrix of \sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) is:

\operatorname{Var} \left( \sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \right) \ = \ \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1} \, \mathbf m_{\mathbf {xx}} \, \mathbf m_{\mathbf {xx}}'^{-1} \ = \ \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1}


So we have the fundamental result about the limiting distribution:

\sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \ \stackrel{d}{\longrightarrow} \ N \left(\mathbf 0, \ \sigma^2 \mathbf m_{\mathbf {xx}}^{-1} \right) \tag{4.6}

Or equivalently, about the asymptotic distribution (which is an approximation to the true, but unknown, distribution of \hat{\boldsymbol{\beta}} in finite but large samples):

\hat{\boldsymbol{\beta}} \ \, \stackrel{a}{\sim} \, \, N \left(\boldsymbol{\beta}, \ \sigma^2 \mathbf m_{\mathbf {xx}}^{-1} / n\right) \tag{4.7}

  • As \mathbf m_{\mathbf {xx}}:=E(\mathbf x'_i \mathbf x_i) is generally unknown, we have to estimate it. The same is true for \sigma^2 (for which we already have a consistent estimator, as shown above)

  • A candidate for this purpose is the empirical counterpart of E(\mathbf x'_i \mathbf x_i). According to the LLN, a consistent estimator for \mathbf m_{\mathbf {xx}} is:

\hat {\mathbf m}_{\mathbf {xx}} \, = \, \bar {\mathbf m}_{\mathbf {xx}} := \, \left( \dfrac{\mathbf X' \mathbf X}{n} \right), \, \text{ with: } \operatorname{plim} \left( \dfrac{\mathbf X' \mathbf X}{n} \right) \, = \, \operatorname{plim} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right ) \, = \, E(\mathbf x'_i \mathbf x_i) := \ \mathbf m_{\mathbf {xx}}

  • Plugging this result into Equation 4.7 we get a consistent estimate of the asymptotic distribution of \hat{\boldsymbol \beta}

\hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \hat \sigma^2 (\mathbf X' \mathbf X)^{-1} \right)

This proves the first two claims of Theorem 4.2 and, trivially, the third one.

For the fourth claim we employ the theorem about the distribution of quadratic forms, Theorem A.1, together with the variance formula from Theorem C.1. As all requirements of these theorems are met, this proves the fourth claim.

q.e.d.


In the proof above we have shown that a consistent estimate of the asymptotic distribution of \hat{\boldsymbol \beta} is:

\hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \hat \sigma^2 (\mathbf X' \mathbf X)^{-1} \right) \tag{5.9}

  • This is exactly the same as the small sample distribution under MLR.6. Hence, all our test procedures remain the same!

As an example, consider the formula for the variance, and hence the standard error, of a particular parameter (this is the j^{th} main diagonal element of the estimated asymptotic covariance matrix, see Equation 2.18)

\widehat{\operatorname{Var}(\hat \beta_j)} = \dfrac{\hat\sigma^2}{SST_{j} (1-R_j^2)}

  • \hat\sigma^2 and (1-R_j^2) converge to fixed numbers, and SST_{j} “converges” to \, n \cdot \operatorname{Var}(x_j)

  • So, \widehat{\operatorname{Var}(\hat\beta_j)} shrinks at the rate 1/n, and se(\hat\beta_j) shrinks at the rate 1/\sqrt n

  • This is why large samples are better (a small numerical check of this variance formula follows below)
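
The following sketch checks the variance formula for \hat\beta_1 against the corresponding diagonal element of vcov() from a fitted lm object (the data-generating setup and the correlation between the two regressors are illustrative assumptions).

# Sketch: Var_hat(b_1) = sigma_hat^2 / (SST_1 (1 - R_1^2)) compared with the
# corresponding diagonal element of the estimated covariance matrix
set.seed(3)
n  <- 300
x1 <- rnorm(n); x2 <- 0.5 * x1 + rnorm(n)     # correlated regressors, so R_1^2 > 0
y  <- 1 + 0.5 * x1 + 0.5 * x2 + rnorm(n)
fit  <- lm(y ~ x1 + x2)
s2   <- summary(fit)$sigma^2
SST1 <- sum((x1 - mean(x1))^2)
R2_1 <- summary(lm(x1 ~ x2))$r.squared        # R_1^2: regress x1 on the other regressor
c(formula = s2 / (SST1 * (1 - R2_1)),
  vcov_lm = vcov(fit)["x1", "x1"])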


If we allow for heteroskedasticity (abandon MLR.5), the proof of asymptotic normality remains valid with some alterations: instead of MLR.5 we assume E(u_i^2|\mathbf x_i) = \bar\sigma^2 w_i, with w_i > 0 and \sum_i w_i = n, and finally get:

\hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ (\mathbf X' \mathbf X)^{-1} \, (\widehat{\bar \sigma^2 \mathbf X' \boldsymbol\Omega \mathbf X}) \, (\mathbf X' \mathbf X)^{-1} \right)

  • Calculating the covariance matrix of the last term in Equation 4.5 we instead now have

\operatorname{Var}(\mathbf x_i'u_i) = E( \mathbf x_i' u_i u_i \mathbf x_i ) = E_x [ E(\mathbf x_i' u_i u_i \mathbf x_i | \mathbf x_i) ] = E_x[ \mathbf x_i' E(u_i^2 | \mathbf x_i) \mathbf x_i ] = \bar \sigma^2 E_x[ \mathbf x_i' w_i \mathbf x_i ] = \bar\sigma^2 \mathbf m_{\mathbf {xwx}}

  • This leads to the asymptotic covariance matrix and asymptotic distribution:

\operatorname{Var} \left( \sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \right) \ = \ \bar \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1} \, \mathbf m_{\mathbf {xwx}} \, \mathbf m_{\mathbf {xx}}'^{-1} \Rightarrow

\hat{\boldsymbol{\beta}} \ \, \stackrel{a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \bar \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1} \, \mathbf m_{\mathbf {xwx}} \, \mathbf m_{\mathbf {xx}}'^{-1} / n\right)

  • Estimating \mathbf m_{\mathbf {xx}} with (\mathbf X' \mathbf X/n) and \bar \sigma^2 \mathbf m_{\mathbf {xwx}} with \widehat{ \bar \sigma^2 \mathbf X' \boldsymbol\Omega \mathbf X/n}, \, \bar \sigma^2 \boldsymbol\Omega being a diagonal matrix whose diagonal elements \bar \sigma^2 w_i are replaced by \hat u_i^2 (it still has to be shown that this procedure yields a consistent estimate, White (1980)), we finally get (a small numerical sketch follows below):

\hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ (\mathbf X' \mathbf X)^{-1} \, (\widehat{\bar \sigma^2 \mathbf X' \boldsymbol\Omega \mathbf X}) \, (\mathbf X' \mathbf X)^{-1} \right)
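
The sandwich form above can be computed by hand. The following sketch (illustrative data whose error variance increases in x; the comparison with the sandwich package is optional and only meaningful if that package is installed) builds the White (1980) “HC0” estimator (\mathbf X'\mathbf X)^{-1}(\mathbf X' \operatorname{diag}(\hat u_i^2) \mathbf X)(\mathbf X'\mathbf X)^{-1} from an lm fit.

# Sketch: heteroskedasticity-robust (White, 1980) covariance matrix
#   (X'X)^{-1} (X' diag(uhat^2) X) (X'X)^{-1}
set.seed(5)
n <- 400
x <- rnorm(n)
u <- rnorm(n, sd = sqrt(exp(x)))     # heteroskedastic errors: Var(u|x) = exp(x)
y <- 1 + 0.5 * x + u
fit  <- lm(y ~ x)
X    <- model.matrix(fit)
uhat <- residuals(fit)
XtXi  <- solve(crossprod(X))
V_HC0 <- XtXi %*% crossprod(X * uhat) %*% XtXi   # X * uhat scales row i by uhat_i
sqrt(diag(V_HC0))                                # robust standard errors
# Optional check against the sandwich package (if installed):
# if (requireNamespace("sandwich", quietly = TRUE))
#   print(sqrt(diag(sandwich::vcovHC(fit, type = "HC0"))))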


4.3.2 An example - a Monte-Carlo simulation experiment

In the following we simulate a regression model to demonstrate the concept of asymptotic normality

  • The model is the same we used to investigate consistency. But this time, the error term u_i is not normally distributed but \chi^2(2)-distributed (scaled to exhibit a variance of 1), which is an extremely right-skewed distribution

  • In the first step, we generate a sample of 15 random draws of u_i and x_i, with Var(u_i) = 1, Var(x_i) = 1 and Cov(u_i, x_i) = 0, and calculate y_i. Then we estimate the model with these simulated data by OLS to get \hat\beta_0, \hat\beta_1 and \hat\sigma^2. The procedure is repeated 10000 times and the results for \hat\beta_0, \hat\beta_1 and \hat\sigma^2 are shown as histograms to display the distributions of the estimated parameters

  • The histograms for \hat\beta_0, \hat\beta_1 and \hat\sigma^2 show pronouncedly skewed distributions, as well as excess kurtosis

  • In the second step, we repeat the whole procedure, but this time with a sample length of 250. As one can see, the distributions of the estimators are now approximately normal. Although we assumed an extremely skewed distribution of the error term, a rather short sample of 250 (which is a typical sample length for quarterly time series) is sufficient for approximately normally distributed estimators in our experiment

  • Clearly, with a sample size of 30000, the distribution of the estimators collapses to a spike, so the OLS estimators remain consistent


sim(n=15, rep=10000, distr=1, histdistr=1)

=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.499   0.772   0.527       0.145       0.841     0.517  
---------------------------------------------------------
   user  system elapsed 
  3.913   0.409   3.394 

Figure 4.5: Simulation with n=15 and 10000 replications, error term is chi2(2)

sim(n=250, rep=10000, distr=1, histdistr=1)

=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.500   0.065   0.065       0.052       0.997     0.091  
---------------------------------------------------------
   user  system elapsed 
  3.828   0.043   3.880 

Figure 4.6: Simulation with n=250 and 10000 replications, error term is chi2(2)

sim(n=30000, rep=1000, distr=1, histdistr=0)

=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.500   0.006   0.006       0.049       0.999     0.008  
---------------------------------------------------------
   user  system elapsed 
 86.709  28.371  14.675 

Figure 4.7: Simulation with n=30000 and 1000 replications, error term is chi2(2)