4 Asymtotics – Econometrics with R

Code

# Function for Monte-Carlo simulation for regressions with one or two variables

sim <- function(n=100, rep=1000, nvar=1, arcoef=0, sige=1, sigu1=1, sigu2=1, correu1=0, correu2=0, corru1u2=0, autocorr=0, debug=0, distr=0, histdistr=0) {
  
  # start clock
  starttime <- proc.time()
  
  # true parameters
  B0 = 1
  B1 = 0.5
  B2 = 0.5
  
  OLS  <- vector(mode = "list", length = rep) # initialing list for storing results
  SOLS <- vector(mode = "list", length = rep) # initialing list for storing results
  
  ######################### rep loop #################################################
  for (i in (1:rep)) {
    
    if (nvar==1) {
      
      if (distr==0) {
        u1 = rnorm(n+10, mean = 0, sd = 1)
        e =  rnorm(n+10, mean = 0, sd = 1)        
      }
      
      if (distr==1) {
        u1 = rnorm(n+10, mean = 0, sd = 1)
        e =  (rchisq(n+10, 2) - 2)/2
      }
      
      
      
      covmat = matrix( c(sige^2, correu1*sige*sigu1, correu1*sige*sigu1, sigu1^2), ncol = 2)
      M <- t(chol(covmat)) # untere Dreiecksmatrix
      Z = t( M %*% t( cbind(e, u1) ) )  # covmat = M cov(e,u1) M'
    
      x1 = filter(Z[,2], arcoef, method = "recursive")  # AR(1) process
      tsp(x1) <- NULL   # remove ts property, for better plotting
      maxx1 = max(x1)
      minx1 = min(x1)
      
      e = filter(Z[,1], autocorr, method = "recursive")
      tsp(e) <- NULL   # remove ts property, for better plotting
      y = B0 + B1*x1 + e
      maxy = max(y)
      miny = min(y)
      
      y = y[11:n+10]; x1 = x1[11:n+10]; 
      
      OLS[[i]] = lm(y ~ x1, model = FALSE)
      
    }
    
    if (nvar==2) {
      
      if (distr==0) {
        u1 = rnorm(n+10, mean = 0, sd = 1)
        u2 = rnorm(n+10, mean = 0, sd = 1) 
        e =  rnorm(n+10, mean = 0, sd = 1)        
      }
      
      if (distr==1) {
        u1 = rnorm(n+10, mean = 0, sd = 1)
        u2 = rnorm(n+10, mean = 0, sd = 1) 
        e =  (rchisq(n+10, 2) - 2)/2
      }
      

      covmat = matrix( c(sige^2, correu1*sige*sigu1, correu2*sige*sigu2,   correu1*sige*sigu1, sigu1^2, corru1u2*sigu1*sigu2,  correu2*sige*sigu2, corru1u2*sigu1*sigu2, sigu2^2 ), ncol = 3)
      
      M <- t(chol(covmat)) # untere Dreiecksmatrix
      Z = t( M %*% t( cbind(e, u1, u2) ) )  # covmat = M cov(e,u1,u2) M'
      
      x1 = filter(Z[,2], arcoef, method = "recursive")  # AR(1) process
      tsp(x1) <- NULL   # remove ts property, for better ploting
      maxx1 = max(x1)
      minx1 = min(x1)
      
      x2 = filter(Z[,3], arcoef, method = "recursive")
      tsp(x2) <- NULL   # remove ts property, for better ploting
      
      e = filter(Z[,1], autocorr, method = "recursive")
      tsp(e) <- NULL   # remove ts property, for better ploting
      y = B0 + B1*x1 + B2*x2 + e
      maxy = max(y)
      miny = min(y)
      
      y = y[11:n+10]; x1 = x1[11:n+10]; x2 = x2[11:n+10]; 
      
      OLS[[i]] = lm(y ~ x1 + x2, model = FALSE)
    }
    
    SOLS[[i]] = summary( OLS[[i]] )
  }
  ########################## end rep loop ############################################
  
  
  # rep = 1: scatterplot with true and estimated regline
  if (rep==1) {
    plot(y ~ x1, col="blue")
    abline(OLS[[i]], col="blue")
    abline(c(B0,B1), col="red")
  }

  
  # rep > 1 and <= 100: true and up to 100 estimated reglines
  if (rep>1 & rep<=100) {
    plot(NULL, xlim = c(minx1*1.1, maxx1*1.1), ylim = c(miny*1.1, maxy*1.1))
    for (i in 1:rep) abline(OLS[[i]], col="grey")
    abline(c(B0,B1), col="red")
  }
  
  
  # rep > 100: histograms of estimated parameters and sigma
  if (rep > 100) {
    BB = c(B0, B1, B2)
    par(mfrow = c(2, 2))
    
    for (i in (0:nvar)) {
      bb = paste0("b",i)
      eval( parse( text=paste0(bb,"<-", mean(unlist(lapply(OLS, function(x) coef(x)[i+1])))) ) ) 

      if (histdistr==0) {
        hist(unlist(lapply(OLS, function(x) coef(x)[i+1])), freq = FALSE, breaks = 26, xlim = c( eval(parse(text=bb))-0.5, eval(parse(text=bb))+0.5 ), main = paste0("Distribution of b",i), xlab = bb) 
      }
      if (histdistr==1) {
        bdistri <- unlist(lapply(OLS, function(x) coef(x)[i+1]))
        hist(bdistri, freq = FALSE, breaks = 30,  main = paste0("Distribution of b",i), xlab = bb) 
        curve(dnorm(x, mean = mean(bdistri),sd=sd(bdistri)),add = TRUE, col = "blue")
      }
      abline(v=BB[i+1], col = "red")
      
    }
    
    
    
    # b0 =  mean(unlist(lapply(OLS, function(x) coef(x)[1]))) 
    # if (histdistr==0) {
    # hist(unlist(lapply(OLS, function(x) coef(x)[1])), freq = FALSE, breaks = 26, xlim = c(b0-0.5, b0+0.5), main = "Distribution of b0", xlab = "b0") 
    # }
    # if (histdistr==1) {
    # hist(unlist(lapply(OLS, function(x) coef(x)[1])), freq = FALSE, breaks = 30,  main = "Distribution of b0", xlab = "b0") 
    # }
    # abline(v=B0, col = "red")
    # 
    # b1 =  mean(unlist(lapply(OLS, function(x) coef(x)[2]))) 
    # if (histdistr==0) {
    # hist(unlist(lapply(OLS, function(x) coef(x)[2])), freq = FALSE, breaks = 26, xlim = c(b1-0.5 ,b1+0.5), main = "Distribution of b1", xlab = "b1") 
    # }
    # if (histdistr==1) {
    # hist(unlist(lapply(OLS, function(x) coef(x)[2])), freq = FALSE, breaks = 30,  main = "Distribution of b1", xlab = "b1") 
    # }
    # abline(v=B1, col = "red")
    # 
    # if (nvar==2) {
    #   b2 =  mean(unlist(lapply(OLS, function(x) coef(x)[3]))) 
    #   hist(unlist(lapply(OLS, function(x) coef(x)[3])), freq = FALSE, breaks = 26, xlim = c(b2-0.5, b2+0.5), main = "Distribution of b2", xlab = "b2") 
    #   abline(v=B2, col = "red") 
    # }
    
    s =  mean(unlist(lapply(SOLS, function(x) x$sigma))) 
    if (histdistr==0) {
    hist(unlist(lapply(SOLS, function(x) x$sigma)), freq = FALSE, breaks = 26, xlim = c(s-0.5, s+0.5), main = "Distribution of sigma", xlab = "sigma") 
    }
    
    if (histdistr==1) {
    sigdistri <- unlist(lapply(SOLS, function(x) x$sigma))
    hist(sigdistri, freq = FALSE, breaks = 30,  main = "Distribution of sigma", xlab = "sigma") 
    curve(dnorm(x, mean = mean(sigdistri),sd=sd(sigdistri)),add = TRUE, col = "blue")
    }
    abline(v=sqrt(sige^2/(1-autocorr^2)), col = "red")
    
    par(mfrow = c(1, 1))
  }
  
  # result table
  if (rep > 100) {
    results <- data.frame(Mean_b1 = b1)
    results$sd_b1 <- sd(unlist(lapply(OLS, function(x) coef(x)[2]))) 
    results$mean_se_b1 <- mean(unlist(lapply(SOLS, function(x) x$coef[2,2]))) 
    results$size_b1_0.05 <- sum( unlist(lapply(SOLS, function(x) abs(x$coef[2,1]-B1)/x$coef[2,2] )) > 1.96) / rep
    
    if (nvar == 2) {
      results$Mean_b2 <-  mean(unlist(lapply(OLS, function(x) coef(x)[3]))) 
      results$sd_b2 <- sd(unlist(lapply(OLS, function(x) coef(x)[3]))) 
      results$mean_se_b2 <- mean(unlist(lapply(SOLS, function(x) x$coef[3,2]))) 
      results$size_b2_0.05 <- sum( unlist(lapply(SOLS, function(x) abs(x$coef[3,1]-B2)/x$coef[3,2] )) > 1.96) / rep
    }
    results$mean_sigma <- s
    results$sd_sigma <- sd(unlist(lapply(SOLS, function(x) x$sigma))) 
    
    stargazer::stargazer(results, type = "text", summary = FALSE, rownames = FALSE)
  }
  

  if (debug==1) {
    print("covmat")
    print(covmat)
    print("chol")
    print(M)
    print("MM' = covmat")
    print(M %*% t(M))
    print("colMeans")
    print(colMeans(Z))
    print( "new covmat")
    print( cov(Z) )
    print( "new cormat" )
    print( cor(Z) )
  }
  
# Stop the clock
endtime <- proc.time()
print(endtime-starttime)

}

4.1 Motivation

So far we focused on properties of OLS that hold for any sample

Properties of OLS that hold for any sample/sample size (see Section 2.7)
- Expected values/unbiasedness under MLR.1 – MLR.4
- Variance formulas under MLR.1 – MLR.5
- Gauss-Markov Theorem under MLR.1 – MLR.5
- Exact sampling distributions/tests under MLR.1 – MLR.6
Properties of OLS that hold in large samples
- Consistency under MLR.1 – MLR.4’
- Asymptotic normality/tests under MLR.1 – MLR.4’

4.2 Consistency

An estimator \hat\theta_n for a sample size n is consistent for a population (true) parameter \theta if P(|\hat\theta_n - \theta| < \epsilon) \rightarrow 1, \ \text{ for arbitrary } \epsilon > 0 \ \text{ and } \ n \rightarrow \infty
Alternative notations: probability limit of \hat \theta is \theta: \operatorname{plim}(\hat\theta) = \theta
or, \hat \theta converges in probability to \theta \hat \theta \ \stackrel{p}{\longrightarrow} \ \theta
Interpretation
Consistency means that the probability that the estimate is arbitrarily close to the true population value can be made arbitrarily high by increasing the sample size
Loosely speaking that means that with larger sample size the estimate becomes more precise and eventually, the estimate converges to the true parameter value
Consistency is the most important requirement for sensible estimators

4.2.1 Consistency of OLS

Theorem 4.1 (Consistency of \hat {\boldsymbol \beta} and \hat \sigma^2) Under MLR.1 - MLR.4 we have

\operatorname{plim}(\hat{\boldsymbol{\beta}}) = \boldsymbol\beta

Under MLR.1 - MLR.5 we have

\operatorname{plim}({\hat \sigma^2}) = \sigma^2

Important Remark, especially for time series:

Assumption MLR.4, E(u|\mathbf X) = 0 or E(u_i|\mathbf x_i) = 0 plus random sampling can be relaxed to

Assumption MLR.4’: E(u_i|\mathbf x_i) = 0 without random sampling or even more to E(u)=0 and \operatorname{Cov}(\mathbf x_i, u_i)=0.

4.2.2 Proof of \operatorname{plim}(\hat{\boldsymbol{\beta}}) = \boldsymbol\beta

Expand to go through the proof

Our true model (population model or data generating process) is:

\mathbf y = \mathbf X \boldsymbol\beta + \mathbf u \tag{4.1}

This leads to the OLS estimator for \boldsymbol \beta:

\hat{\boldsymbol{\beta}}=(\mathbf{X}' \mathbf{X})^{-1} \mathbf{X}' \mathbf{y} \tag{4.2}

Plugging in Equation 4.1 for y in Equation 4.2 leads to:

\hat{\boldsymbol{\beta}} \ = \ \boldsymbol{\beta}+\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{X}' \mathbf{u} \tag{4.3}

Dividing and multiplying the last term by n we get:

\hat{\boldsymbol{\beta}} \ = \ \boldsymbol{\beta}+\left(\dfrac{\mathbf{X}' \mathbf{X}}{n} \right)^{-1} \left( \dfrac{\mathbf{X}' \mathbf{u}}{n} \right) \ =

= \ \boldsymbol{\beta}+\left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right )^{-1} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x'_i u_i \right ) \tag{4.4}

Now we can apply the Law of Large Numbers (LLN, Theorem A.2) i.e., empirical moments (averages) converge in probability to the expected values of their summands under quite general conditions
1. The expected value of the first summand is E(\mathbf x_i' \mathbf x_i) := \mathbf m_{xx}, the moment matrix of the variables in \mathbf X (the second moments have to exit, which is a requirement for the LLN)
2. The expected value of the second summand is by MLR.4’: E(\mathbf x'_i u_i) := \mathbf m_{xu} = E_x(E(\mathbf x'_i u_i | \mathbf x_i)) = E_x(\mathbf 0) = \mathbf 0
Hence, we have for Equation 4.4 (by Slutsky’s theorem, see Theorem A.4, especially Equation A.9 and Equation A.10):

\operatorname{plim} \hat{\boldsymbol{\beta}} \ = \ \boldsymbol{\beta} + \operatorname{plim} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right )^{-1} \operatorname{plim} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x'_i u_i \right ) \ =

= \ \boldsymbol{\beta} + \mathbf m_{xx}^{-1} \, \mathbf m_{xu} \ = \ \boldsymbol{\beta} + \mathbf m_{xx}^{-1} \, \mathbf 0 \ = \ \boldsymbol{\beta} \tag{5.5}

q.e.d.

Proof of \operatorname{plim}({\hat \sigma^2}) = \sigma^2

Expand to go through the proof

We have:

\hat\sigma^2 = \frac{1}{n-k-1} \sum_{i=1}^n \hat u_i^2 = \frac{1}{n-k-1} \, \hat{\mathbf u}' \hat{\mathbf u} = \frac{1}{n-k-1} \, \mathbf y' \mathbf {M_x} \mathbf y = \frac{1}{n-k-1} \, \mathbf u' \mathbf {M_x} \mathbf u

The fourth term follows directly from the definition and idempotence of \mathbf M (see the discussion of the FWL theorem, Section 2.5.1 or Equation C.10) and for the last term we have:

\mathbf {M_x} \mathbf y = \mathbf {M_x} (\underbrace{\mathbf X \boldsymbol{\beta}+\mathbf u}_\mathbf y ) = \underbrace{\mathbf {M_x} \mathbf X}_\mathbf 0 \, \boldsymbol{\beta} + \mathbf {M_x} \mathbf u = \mathbf {M_x} \mathbf u

So, we finally need the plim of:

\hat\sigma^2 = \frac{1}{n-k-1} \, \mathbf u' \mathbf {M_x} \mathbf u = \frac{1}{n-k-1} \mathbf u' ( \underbrace{ \mathbf I - \mathbf X(\mathbf X '\mathbf X)^{-1} \mathbf X' }_\mathbf{M_x} ) \mathbf u =

=\frac{n}{n-k-1}\left[\frac{\mathbf{u}^{\prime} \mathbf{u}}{n}-\left(\frac{\mathbf{u}^{\prime} \mathbf{X}}{n}\right)\left(\frac{\mathbf{X} \mathbf{X}}{n}\right)^{-1}\left(\frac{\mathbf{X} \mathbf{u}}{n}\right)\right] \ \ \Rightarrow

\operatorname{plim} \hat\sigma^2 \ =

\underbrace{\operatorname{plim}\left[\frac{n}{n-k-1}\right]}_{1} \left[ \operatorname{plim}\left[\frac{\mathbf{u}^{\prime} \mathbf{u}}{n}\right]-\operatorname{plim}\underbrace{\left[\frac{\mathbf{u}^{\prime} \mathbf{X}}{n}\right]}_{\mathbf {\bar m_{xu}}\rightarrow \mathbf 0} \operatorname{plim}\underbrace{\left[\left(\frac{\mathbf{X} \mathbf{X}}{n}\right)^{-1}\right]}_{\mathbf {\bar m_{xx}^{-1}}\rightarrow \mathbf {m_{xx}^{-1}}} \operatorname{plim}\underbrace{\left[\frac{\mathbf{X}^{\prime} \mathbf{u}}{n}\right]}_{\mathbf {\bar m_{xu}}\rightarrow \mathbf 0} \right ] =

\operatorname{plim} \left[ \frac{\mathbf{u}^{\prime} \mathbf{u}}{n} \right] \ = \ \operatorname{plim} \left[ \frac{1}{n} \sum_{i=1}^{n} u_{i}^{2} \right]

The last term is an average of a sequence of random numbers and, according to the LLN, converges to it’s expected value:

E(u_i^2) = E_x [ E(u_i^2 | \mathbf x_i)] = \sigma^2

In the case of heteroscedasticity we would have: E(u_i^2) = E_x [ E(u_i^2 | \mathbf x_i)] = E_x(\sigma_i^2) = \bar \sigma^2.

q.e.d.

4.2.3 Some examples - a Monte-Carlo simulation experiment

In the following we simulate a regression model to demonstrate the concept of consistency

The model is:

y_i = 1 + 0.5 x_i + u_i

In the first step, we generate a sample of 100 random draws of u_i and x_i, with Var(\ u_i) = 1, Var(x_i) = 1 and Cov(u_i, x_i) = 0 to calculate y_i. Then we estimate the model above with these simulated data by OLS to get \hat\beta_0, \hat\beta_1 and \hat\sigma^2. The procedure is repeated 5000 times and the results for \hat\beta_0, \hat\beta_1 and \hat\sigma^2 are shown as histogram to display the distribution for the estimated parameters
In the second, step we repeat the whole procedure, but this time with a sample length of 30000. As one can see the distributions of the estimators collapse to a spike over the true parameter values, which are marked by the vertical red lines. The estimators are consistent!
In the third and fourth step, the two steps above are repeated, but this time with Cov(u_i, x_i) = 0.3. Hence, E(u_i | \mathbf x_i) \neq 0, which means our assumption MLR.4’ is not met
The histograms of the estimators clearly show, that \hat\beta_1 and \hat\sigma^2 are not consistent any more, although the correlation between u_i and x_i is quite moderate

ret <- sim(n=100,rep=5000,nvar=1)


=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.502   0.108   0.107       0.057       1.000     0.075  
---------------------------------------------------------
   user  system elapsed 
  1.440   0.027   1.480

Figure 4.1: Simulation with n=100 and 5000 replications

ret <- sim(n=30000,rep=1000,nvar=1)


=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.500   0.006   0.006       0.058       1.000     0.004  
---------------------------------------------------------
   user  system elapsed 
  6.141   0.317   6.474

Figure 4.2: Simulation with n=30000 and 1000 replications

ret <- sim(n=100,rep=5000,nvar=1,correu1=0.3)


=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.802   0.104   0.102       0.836       0.950     0.072  
---------------------------------------------------------
   user  system elapsed 
  1.370   0.087   1.458

Figure 4.3: Simulation with n=100 and 5000 replications, E(u|x) != 0

ret <- sim(n=30000,rep=1000,nvar=1,correu1=0.3)


=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.800   0.005   0.006         1         0.954     0.004  
---------------------------------------------------------
   user  system elapsed 
  5.650   0.456   6.109

Figure 4.4: Simulation with n=30000 and 1000 replications, E(u|x) != 0

4.3 Asymptotic normality

For carrying out statistical test we needed LRM.6 - normality of the error term. But this assumption is often questionable, and in some cases even impossible to be met, e.g. if the depended variable is restricted to a certain range
If MLR.6 does not hold, the results of t- or F-tests may be wrong
Fortunately, F- and t-tests still work if the sample size is large enough
Also, OLS estimates are normal in large samples even without MLR.6

Hence, we can state the following theorem:

Theorem 4.2 (Asymptotic normality of \hat {\boldsymbol \beta}) Under assumptions MLR.1 – MLR.5:

\sqrt{n}(\hat {\boldsymbol{\beta}} - \boldsymbol{\beta}) \ \stackrel{d}{\longrightarrow} \ N \left( \mathbf 0, \ \sigma^2 \mathbf m_{\mathbf {xx}}^{-1} \right) And a consistent estimate of the asymptotic distribution of \hat{\boldsymbol \beta} is:

\hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \hat \sigma^2 (\mathbf X' \mathbf X)^{-1} \right)

In particular, this trivially implies:

\dfrac{(\hat\beta_j - \beta_j)}{ se(\hat\beta_j)} \ \, \stackrel{a}{\sim} \ \, N(0,\, 1)

And furthermore, if E(\mathbf{R} \hat{\boldsymbol{\beta}}) = \mathbf{c}) and \mathbf R has m rows:

W \ = \ (\mathbf{R} \hat{\boldsymbol{\beta}}-\mathbf{c})^{\prime}\left(\sigma^{2} \, \mathbf{R}(\mathbf{X} \mathbf{X})^{-1} \mathbf{R}^{\prime}\right)^{-1}(\mathbf{R} \hat{\boldsymbol{\beta}}-\mathbf{c}) \ \, \stackrel{a}{\sim} \ \, \chi^{2}(m)

4.3.1 Proof of asymptotic normality

Expand to go through the proof

Once more, our starting point is representation Equation 4.4 from the consistency proof, Section 4.2.2

\hat{\boldsymbol{\beta}} - \boldsymbol{\beta} \ = \ \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right )^{-1} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x'_i u_i \right )

From the consistency proof we know that the whole term converges in probability to \mathbf 0. Therefore, we multiply by \sqrt n, so that the term could converge in distribution to a random vector:

\sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \ = \ \underbrace{ \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right)^{-1}}_{\stackrel{p}{\longrightarrow} \ \mathbf {m_{xx}^{-1}}} \underbrace{\left( \frac{1}{\sqrt{n}} \sum_{i=1}^n \mathbf x'_i u_i \right ) }_{\stackrel{d}{\longrightarrow} \ N(0,\, \sigma^2 \mathbf {m_{xx}}) } \tag{4.5}

Under our standard assumptions and according to the LLN, the first term converges in probability to E(\mathbf x_i' \mathbf x_i )^{-1} := \mathbf m_{\mathbf {xx}}^{-1}

The more interesting term of Equation 4.5 is the second one. This term does not converge to \mathbf 0 any more, as now we divide by \sqrt n and not by n, as it was the case in the consistency proof
According to the Central Limit Theorem (CLT, Theorem A.3) the second term, which is a sum of arbitrarily distributed random vectors divided by \sqrt n, converges in distribution under quite similar conditions as we had with the LLN to a normally distributed random vector with expected value

E(\mathbf x_i' u_i) = E_x[\mathbf x_i' E(u_i | \mathbf x_i) ] = \mathbf 0

and because \operatorname{Cov}(\mathbf x_i' u_i, \mathbf x_j' u_j) = 0, random sampling, we can write for the covariance matrix

\operatorname {Var}(\mathbf x_i' u_i) = E( \mathbf x_i' u_i u_i \mathbf x_i ) = E_x [ E(\mathbf x_i' u_i u_i \mathbf x_i | \mathbf x_i) ] = E_x[ \mathbf x_i' E(u_i^2 | \mathbf x_i) \mathbf x_i )] = \sigma^2 E_x[ \mathbf x_i' \mathbf x_i ] = \sigma^2 \mathbf m_{\mathbf {xx}}

Hence, because of the convergence in probability of the first term of Equation 4.5 and the covariance formula from Equation C.7, the covariance matrix of \sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) is:

\operatorname{Var} \left( \sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \right) \ = \ \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1} \, \mathbf m_{\mathbf {xx}} \, \mathbf m_{\mathbf {xx}}'^{-1} \ = \ \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1}

So we have the fundamental result about the limiting distribution:

\sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \ \stackrel{d}{\longrightarrow} \ N \left(\mathbf 0, \ \sigma^2 \mathbf m_{\mathbf {xx}}^{-1} \right) \tag{4.6}

Or equivalent, about the asymptotic distribution (which is an approximation to the true, but unknown distribution of \hat{\boldsymbol{\beta}} at finite but large samples):

\hat{\boldsymbol{\beta}} \ \, \stackrel{a}{\sim} \, \, N \left(\boldsymbol{\beta}, \ \sigma^2 \mathbf m_{\mathbf {xx}}^{-1} / n\right) \tag{4.7}

As \mathbf m_{\mathbf {xx}}:=E(\mathbf x'_i \mathbf x_i) is generally unknown, we have to estimate it. The same is true for \sigma^2 (for which we already have a consistent estimator, as shown above)
A candidate for this purpose is the empirical counter part of E(\mathbf x'_i \mathbf x_i). According to the LLN, a consistent estimator for \mathbf m_{\mathbf {xx}} is:

\hat {\mathbf m}_{\mathbf {xx}} \, = \, \bar {\mathbf m}_{\mathbf {xx}} := \, \left( \dfrac{\mathbf X' \mathbf X}{n} \right), \, \text{ with: } \operatorname{plim} \left( \dfrac{\mathbf X' \mathbf X}{n} \right) \, = \, \operatorname{plim} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right ) \, = \, E(\mathbf x'_i \mathbf x_i) := \ \mathbf m_{\mathbf {xx}}

Plugging this result into Equation 4.7 we get a consistent estimate of the asymptotic distribution of \hat{\boldsymbol \beta}

\hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \hat \sigma^2 (\mathbf X' \mathbf X)^{-1} \right) This proof the first two claims of Theorem 4.2, and trivially the third one.

For the fourth claim we employ the theorem about the distribution of quadratic forms, Theorem A.1, together with the variance formula from Theorem C.1. As all requirements of these theorems are met, this poofs the fourth claim.

q.e.d.

In the proof above we have shown that a consistent estimate of the asymptotic distribution of \hat{\boldsymbol \beta} is:

\hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \hat \sigma^2 (\mathbf X' \mathbf X)^{-1} \right) \tag{5.9}

This is exactly the same as the small sample distribution under MLR.6. Hence, all our test procedures remain the same!

As an example, consider the formula for the variance, respectively standard error of a particular parameter (this is the j^{th} main diagonal element of the estimated asymptotic covariance matrix, see Equation 2.31)

\widehat{\operatorname{Var}(\hat \beta_j)} = \dfrac{\hat\sigma^2}{SST_{j} (1-R_j^2)}

\hat\sigma^2 and (1-R_j^2) converge to fixed numbers, and SST_{j} “converge” to \, n \cdot \operatorname{Var}(x_j)
So, \widehat{\operatorname{Var}(\hat{\boldsymbol \beta}_j)} shrinks with the rate 1/n, and se(\hat{\boldsymbol \beta}_j) shrinks with the rate 1/\sqrt n
This is why large samples are better

If we allow for heteroskedasticity (abandon MLR.5), the proof for asymptotic normality remains valid with some alterations: Instead of MLR.5 we assume: E(u_i^2|\mathbf x_i) = \bar\sigma^2 w_i, with w_i > 0, \ \sum_i w_i = n and finally get:

\hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ (\mathbf X' \mathbf X)^{-1} \, (\widehat{\bar \sigma^2 \mathbf X' \boldsymbol\Omega \mathbf X}) \, (\mathbf X' \mathbf X)^{-1} \right)

Sketch of proof under heteroskedasticity

Calculating the covariance matrix of the last term in Equation 4.5 we instead now have

\operatorname{Var}(\mathbf x_i'u_i) = E( \mathbf x_i' u_i u_i \mathbf x_i ) = E_x [ E(\mathbf x_i' u_i u_i \mathbf x_i | \mathbf x_i) ] = E_x[ \mathbf x_i' E(u_i^2 | \mathbf x_i) \mathbf x_i )] = \bar \sigma^2 E_x[ \mathbf x_i' w_i \mathbf x_i ] = \bar\sigma^2 \mathbf m_{\mathbf {xwx}}

This leads to the asymptotic covariance matrix and asymptotic distribution:

\operatorname{Var} \left( \sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \right) \ = \ \bar \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1} \, \mathbf m_{\mathbf {xwx}} \, \mathbf m_{\mathbf {xx}}'^{-1} \Rightarrow

\hat{\boldsymbol{\beta}} \ \, \stackrel{a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \bar \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1} \, \mathbf m_{\mathbf {xwx}} \, \mathbf m_{\mathbf {xx}}'^{-1} / n\right)

Estimating \mathbf m_{\mathbf {xx}} with (\mathbf X' \mathbf X/n) and \bar \sigma^2 \mathbf m_{\mathbf {xwx}} with \widehat{ \bar \sigma^2 \mathbf X' \boldsymbol\Omega \mathbf X/n}), \, \bar \sigma^2 \boldsymbol\Omega being a diagonal matrix with diagonal elements \bar \sigma^2 w_i replaced by \hat u_i^2 (which needs to be shown that this procedure leads to a consistent estimate, White (1980)), we finally get:

4.3.2 An examples - a Monte-Carlo simulation experiment

In the following we simulate a regression model to demonstrate the concept of asymptotic normality

The model is the same we used to investigate consistency. But this time, the error term u_i is not normally distributed but \chi^2(2) distributed (scaled to exhibit a variance of 1) which is an extreme right skewed distribution
In the first step, we generate a sample of 15 random draws of u_i and x_i, with Var(\ u_i) = 1, Var(x_i) = 1 and Cov(u_i, x_i) = 0 to calculate y_i. Then we estimate the model with these simulated data by OLS to get \hat\beta_0, \hat\beta_1 and \hat\sigma^2. The procedure is repeated 10000 times and the results for \hat\beta_0, \hat\beta_1 and \hat\sigma^2 are shown as histogram to display the distribution for the estimated parameters
The histograms for \hat\beta_0, \hat\beta_1 and \hat\sigma^2 show a pronounced skewed distributions, as well as a too large kurtosis
In the second, step we repeat the whole procedure, but this time with a sample length of 250. As one can see the distributions of the estimators are now approximately normally distributed. Although we assumed an extreme skewed distribution of the error term, a quite short sample of 250 (which is a typical sample length of quarterly time series) is sufficient for approximately normally distributed estimators in our experiment
Clearly, with a sample size of 30000, the distribution of the estimators collapses to a spike, so the OLS estimators remain consistent

sim(n=15, rep=10000, distr=1, histdistr=1)


=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.515   0.737   0.531       0.142       0.847     0.527  
---------------------------------------------------------
   user  system elapsed 
  2.654   0.110   2.765

Figure 4.5: Simulation with n=15 and 10000 replications, error term is chi²(2)

sim(n=250, rep=10000, distr=1, histdistr=1)


=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.500   0.066   0.065       0.054       0.996     0.091  
---------------------------------------------------------
   user  system elapsed 
  3.199   0.046   3.256

Figure 4.6: Simulation with n=250 and 10000 replications, error term is chi²(2)

sim(n=30000, rep=1000, distr=1, histdistr=0)


=========================================================
Mean_b1 sd_b1 mean_se_b1 size_b1_0.05 mean_sigma sd_sigma
---------------------------------------------------------
0.500   0.006   0.006       0.056       1.000     0.008  
---------------------------------------------------------
   user  system elapsed 
  7.053   0.373   7.429

Figure 4.7: Simulation with n=3000 and 1000 replications, error term is chi²(2)

--- title-block-banner: true subtitle: "Mainly based on @wooldridge_Intro_Econometrics, Chapter 5" --- # Asymtotics \pagebreak ```{r} #| code-fold: true # Function for Monte-Carlo simulation for regressions with one or two variables sim <- function(n=100, rep=1000, nvar=1, arcoef=0, sige=1, sigu1=1, sigu2=1, correu1=0, correu2=0, corru1u2=0, autocorr=0, debug=0, distr=0, histdistr=0) { # start clock starttime <- proc.time() # true parameters B0 = 1 B1 = 0.5 B2 = 0.5 OLS <- vector(mode = "list", length = rep) # initialing list for storing results SOLS <- vector(mode = "list", length = rep) # initialing list for storing results ######################### rep loop ################################################# for (i in (1:rep)) { if (nvar==1) { if (distr==0) { u1 = rnorm(n+10, mean = 0, sd = 1) e = rnorm(n+10, mean = 0, sd = 1) } if (distr==1) { u1 = rnorm(n+10, mean = 0, sd = 1) e = (rchisq(n+10, 2) - 2)/2 } covmat = matrix( c(sige^2, correu1*sige*sigu1, correu1*sige*sigu1, sigu1^2), ncol = 2) M <- t(chol(covmat)) # untere Dreiecksmatrix Z = t( M %*% t( cbind(e, u1) ) ) # covmat = M cov(e,u1) M' x1 = filter(Z[,2], arcoef, method = "recursive") # AR(1) process tsp(x1) <- NULL # remove ts property, for better plotting maxx1 = max(x1) minx1 = min(x1) e = filter(Z[,1], autocorr, method = "recursive") tsp(e) <- NULL # remove ts property, for better plotting y = B0 + B1*x1 + e maxy = max(y) miny = min(y) y = y[11:n+10]; x1 = x1[11:n+10]; OLS[[i]] = lm(y ~ x1, model = FALSE) } if (nvar==2) { if (distr==0) { u1 = rnorm(n+10, mean = 0, sd = 1) u2 = rnorm(n+10, mean = 0, sd = 1) e = rnorm(n+10, mean = 0, sd = 1) } if (distr==1) { u1 = rnorm(n+10, mean = 0, sd = 1) u2 = rnorm(n+10, mean = 0, sd = 1) e = (rchisq(n+10, 2) - 2)/2 } covmat = matrix( c(sige^2, correu1*sige*sigu1, correu2*sige*sigu2, correu1*sige*sigu1, sigu1^2, corru1u2*sigu1*sigu2, correu2*sige*sigu2, corru1u2*sigu1*sigu2, sigu2^2 ), ncol = 3) M <- t(chol(covmat)) # untere Dreiecksmatrix Z = t( M %*% t( cbind(e, u1, u2) ) ) # covmat = M cov(e,u1,u2) M' x1 = filter(Z[,2], arcoef, method = "recursive") # AR(1) process tsp(x1) <- NULL # remove ts property, for better ploting maxx1 = max(x1) minx1 = min(x1) x2 = filter(Z[,3], arcoef, method = "recursive") tsp(x2) <- NULL # remove ts property, for better ploting e = filter(Z[,1], autocorr, method = "recursive") tsp(e) <- NULL # remove ts property, for better ploting y = B0 + B1*x1 + B2*x2 + e maxy = max(y) miny = min(y) y = y[11:n+10]; x1 = x1[11:n+10]; x2 = x2[11:n+10]; OLS[[i]] = lm(y ~ x1 + x2, model = FALSE) } SOLS[[i]] = summary( OLS[[i]] ) } ########################## end rep loop ############################################ # rep = 1: scatterplot with true and estimated regline if (rep==1) { plot(y ~ x1, col="blue") abline(OLS[[i]], col="blue") abline(c(B0,B1), col="red") } # rep > 1 and <= 100: true and up to 100 estimated reglines if (rep>1 & rep<=100) { plot(NULL, xlim = c(minx1*1.1, maxx1*1.1), ylim = c(miny*1.1, maxy*1.1)) for (i in 1:rep) abline(OLS[[i]], col="grey") abline(c(B0,B1), col="red") } # rep > 100: histograms of estimated parameters and sigma if (rep > 100) { BB = c(B0, B1, B2) par(mfrow = c(2, 2)) for (i in (0:nvar)) { bb = paste0("b",i) eval( parse( text=paste0(bb,"<-", mean(unlist(lapply(OLS, function(x) coef(x)[i+1])))) ) ) if (histdistr==0) { hist(unlist(lapply(OLS, function(x) coef(x)[i+1])), freq = FALSE, breaks = 26, xlim = c( eval(parse(text=bb))-0.5, eval(parse(text=bb))+0.5 ), main = paste0("Distribution of b",i), xlab = bb) } if (histdistr==1) { bdistri <- unlist(lapply(OLS, function(x) coef(x)[i+1])) hist(bdistri, freq = FALSE, breaks = 30, main = paste0("Distribution of b",i), xlab = bb) curve(dnorm(x, mean = mean(bdistri),sd=sd(bdistri)),add = TRUE, col = "blue") } abline(v=BB[i+1], col = "red") } # b0 = mean(unlist(lapply(OLS, function(x) coef(x)[1]))) # if (histdistr==0) { # hist(unlist(lapply(OLS, function(x) coef(x)[1])), freq = FALSE, breaks = 26, xlim = c(b0-0.5, b0+0.5), main = "Distribution of b0", xlab = "b0") # } # if (histdistr==1) { # hist(unlist(lapply(OLS, function(x) coef(x)[1])), freq = FALSE, breaks = 30, main = "Distribution of b0", xlab = "b0") # } # abline(v=B0, col = "red") # # b1 = mean(unlist(lapply(OLS, function(x) coef(x)[2]))) # if (histdistr==0) { # hist(unlist(lapply(OLS, function(x) coef(x)[2])), freq = FALSE, breaks = 26, xlim = c(b1-0.5 ,b1+0.5), main = "Distribution of b1", xlab = "b1") # } # if (histdistr==1) { # hist(unlist(lapply(OLS, function(x) coef(x)[2])), freq = FALSE, breaks = 30, main = "Distribution of b1", xlab = "b1") # } # abline(v=B1, col = "red") # # if (nvar==2) { # b2 = mean(unlist(lapply(OLS, function(x) coef(x)[3]))) # hist(unlist(lapply(OLS, function(x) coef(x)[3])), freq = FALSE, breaks = 26, xlim = c(b2-0.5, b2+0.5), main = "Distribution of b2", xlab = "b2") # abline(v=B2, col = "red") # } s = mean(unlist(lapply(SOLS, function(x) x$sigma))) if (histdistr==0) { hist(unlist(lapply(SOLS, function(x) x$sigma)), freq = FALSE, breaks = 26, xlim = c(s-0.5, s+0.5), main = "Distribution of sigma", xlab = "sigma") } if (histdistr==1) { sigdistri <- unlist(lapply(SOLS, function(x) x$sigma)) hist(sigdistri, freq = FALSE, breaks = 30, main = "Distribution of sigma", xlab = "sigma") curve(dnorm(x, mean = mean(sigdistri),sd=sd(sigdistri)),add = TRUE, col = "blue") } abline(v=sqrt(sige^2/(1-autocorr^2)), col = "red") par(mfrow = c(1, 1)) } # result table if (rep > 100) { results <- data.frame(Mean_b1 = b1) results$sd_b1 <- sd(unlist(lapply(OLS, function(x) coef(x)[2]))) results$mean_se_b1 <- mean(unlist(lapply(SOLS, function(x) x$coef[2,2]))) results$size_b1_0.05 <- sum( unlist(lapply(SOLS, function(x) abs(x$coef[2,1]-B1)/x$coef[2,2] )) > 1.96) / rep if (nvar == 2) { results$Mean_b2 <- mean(unlist(lapply(OLS, function(x) coef(x)[3]))) results$sd_b2 <- sd(unlist(lapply(OLS, function(x) coef(x)[3]))) results$mean_se_b2 <- mean(unlist(lapply(SOLS, function(x) x$coef[3,2]))) results$size_b2_0.05 <- sum( unlist(lapply(SOLS, function(x) abs(x$coef[3,1]-B2)/x$coef[3,2] )) > 1.96) / rep } results$mean_sigma <- s results$sd_sigma <- sd(unlist(lapply(SOLS, function(x) x$sigma))) stargazer::stargazer(results, type = "text", summary = FALSE, rownames = FALSE) } if (debug==1) { print("covmat") print(covmat) print("chol") print(M) print("MM' = covmat") print(M %*% t(M)) print("colMeans") print(colMeans(Z)) print( "new covmat") print( cov(Z) ) print( "new cormat" ) print( cor(Z) ) } # Stop the clock endtime <- proc.time() print(endtime-starttime) } ``` ## Motivation ##### So far we focused on properties of OLS that hold for any sample - Properties of OLS that hold for **any** sample/sample size (see @sec-MLR) - Expected values/unbiasedness under MLR.1 – MLR.4 - Variance formulas under MLR.1 – MLR.5 - Gauss-Markov Theorem under MLR.1 – MLR.5 - Exact sampling distributions/tests under MLR.1 – MLR.6 - Properties of OLS that hold in *large* samples - **Consistency** under MLR.1 – MLR.4' - **Asymptotic** normality/tests under MLR.1 – MLR.4' ------------------------------------------------------------------------ ## Consistency - An estimator $\hat\theta_n$ for a sample size $n$ is *consistent* for a population (true) parameter $\theta$ if $$P(|\hat\theta_n - \theta| < \epsilon) \rightarrow 1, \ \text{ for arbitrary } \epsilon > 0 \ \text{ and } \ n \rightarrow \infty $$ - Alternative notations: *probability limit* of $\hat \theta$ is $\theta$: $$ \operatorname{plim}(\hat\theta) = \theta $$ - or, $\hat \theta$ *converges in probability* to $\theta$ $$ \hat \theta \ \stackrel{p}{\longrightarrow} \ \theta $$ - Interpretation - Consistency means that the probability that the estimate is arbitrarily close to the true population value can be made arbitrarily high by increasing the sample size - Loosely speaking that means that with larger sample size the estimate becomes more precise and eventually, the estimate converges to the true parameter value - Consistency is the most important requirement for sensible estimators ------------------------------------------------------------------------ ### Consistency of OLS ::: {#thm-consistent_beta_sigma} ## Consistency of $\hat {\boldsymbol \beta}$ and $\hat \sigma^2$ Under MLR.1 - MLR.4 we have $$ \operatorname{plim}(\hat{\boldsymbol{\beta}}) = \boldsymbol\beta $$ Under MLR.1 - MLR.5 we have $$ \operatorname{plim}({\hat \sigma^2}) = \sigma^2 $$ ::: ##### **Important Remark, especially for time series:** Assumption MLR.4, $E(u|\mathbf X) = 0$ or $E(u_i|\mathbf x_i) = 0$ *plus* random sampling can be relaxed to Assumption MLR.4': $E(u_i|\mathbf x_i) = 0$ *without* random sampling or even more to $E(u)=0$ and $\operatorname{Cov}(\mathbf x_i, u_i)=0$. ------------------------------------------------------------------------ ### Proof of $\operatorname{plim}(\hat{\boldsymbol{\beta}}) = \boldsymbol\beta$ {#sec-cosistProof} ::: {.callout-tip collapse="true" icon="false"} ## Expand to go through the proof - Our true model (population model or data generating process) is: $$ \mathbf y = \mathbf X \boldsymbol\beta + \mathbf u $$ {#eq-baseModelConsit} - This leads to the OLS estimator for $\boldsymbol \beta$: $$ \hat{\boldsymbol{\beta}}=(\mathbf{X}' \mathbf{X})^{-1} \mathbf{X}' \mathbf{y} $$ {#eq-betahatConsit} - Plugging in @eq-baseModelConsit for **y** in @eq-betahatConsit leads to: $$ \hat{\boldsymbol{\beta}} \ = \ \boldsymbol{\beta}+\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{X}' \mathbf{u} $$ {#eq-betahatConsitAltern} - Dividing and multiplying the last term by *n* we get: $$ \hat{\boldsymbol{\beta}} \ = \ \boldsymbol{\beta}+\left(\dfrac{\mathbf{X}' \mathbf{X}}{n} \right)^{-1} \left( \dfrac{\mathbf{X}' \mathbf{u}}{n} \right) \ = $$ $$ = \ \boldsymbol{\beta}+\left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right )^{-1} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x'_i u_i \right ) $$ {#eq-betahatConsitAltern1} - Now we can apply the *Law of Large Numbers* (LLN, @thm-LLN) i.e., empirical moments (averages) converge in probability to the expected values of their summands under quite general conditions 1) The expected value of the first summand is $E(\mathbf x_i' \mathbf x_i) := \mathbf m_{xx}$, the moment matrix of the variables in $\mathbf X$ (the second moments have to exit, which is a requirement for the LLN) 2) The expected value of the second summand is by MLR.4': $$ E(\mathbf x'_i u_i) := \mathbf m_{xu} = E_x(E(\mathbf x'_i u_i | \mathbf x_i)) = E_x(\mathbf 0) = \mathbf 0$$ - Hence, we have for @eq-betahatConsitAltern1 (by Slutsky’s theorem, see @thm-plim, especially @eq-A_slutz_mult and @eq-A_slutz_inv): $$ \operatorname{plim} \hat{\boldsymbol{\beta}} \ = \ \boldsymbol{\beta} + \operatorname{plim} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right )^{-1} \operatorname{plim} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x'_i u_i \right ) \ = $$ $$ = \ \boldsymbol{\beta} + \mathbf m_{xx}^{-1} \, \mathbf m_{xu} \ = \ \boldsymbol{\beta} + \mathbf m_{xx}^{-1} \, \mathbf 0 \ = \ \boldsymbol{\beta} \tag{5.5} $$ q.e.d. ::: ------------------------------------------------------------------------ #### Proof of $\operatorname{plim}({\hat \sigma^2}) = \sigma^2$ ::: {.callout-tip collapse="true" icon="false"} ## Expand to go through the proof We have: $$ \hat\sigma^2 = \frac{1}{n-k-1} \sum_{i=1}^n \hat u_i^2 = \frac{1}{n-k-1} \, \hat{\mathbf u}' \hat{\mathbf u} = \frac{1}{n-k-1} \, \mathbf y' \mathbf {M_x} \mathbf y = \frac{1}{n-k-1} \, \mathbf u' \mathbf {M_x} \mathbf u $$ The fourth term follows directly from the definition and *idempotence* of $\mathbf M$ (see the discussion of the FWL theorem, @sec-FWL_Mat or @eq-C_MMat) and for the last term we have: $$ \mathbf {M_x} \mathbf y = \mathbf {M_x} (\underbrace{\mathbf X \boldsymbol{\beta}+\mathbf u}_\mathbf y ) = \underbrace{\mathbf {M_x} \mathbf X}_\mathbf 0 \, \boldsymbol{\beta} + \mathbf {M_x} \mathbf u = \mathbf {M_x} \mathbf u $$ So, we finally need the plim of: $$ \hat\sigma^2 = \frac{1}{n-k-1} \, \mathbf u' \mathbf {M_x} \mathbf u = \frac{1}{n-k-1} \mathbf u' ( \underbrace{ \mathbf I - \mathbf X(\mathbf X '\mathbf X)^{-1} \mathbf X' }_\mathbf{M_x} ) \mathbf u = $$ $$ =\frac{n}{n-k-1}\left[\frac{\mathbf{u}^{\prime} \mathbf{u}}{n}-\left(\frac{\mathbf{u}^{\prime} \mathbf{X}}{n}\right)\left(\frac{\mathbf{X} \mathbf{X}}{n}\right)^{-1}\left(\frac{\mathbf{X} \mathbf{u}}{n}\right)\right] \ \ \Rightarrow $$ $\operatorname{plim} \hat\sigma^2 \ =$ $$ \underbrace{\operatorname{plim}\left[\frac{n}{n-k-1}\right]}_{1} \left[ \operatorname{plim}\left[\frac{\mathbf{u}^{\prime} \mathbf{u}}{n}\right]-\operatorname{plim}\underbrace{\left[\frac{\mathbf{u}^{\prime} \mathbf{X}}{n}\right]}_{\mathbf {\bar m_{xu}}\rightarrow \mathbf 0} \operatorname{plim}\underbrace{\left[\left(\frac{\mathbf{X} \mathbf{X}}{n}\right)^{-1}\right]}_{\mathbf {\bar m_{xx}^{-1}}\rightarrow \mathbf {m_{xx}^{-1}}} \operatorname{plim}\underbrace{\left[\frac{\mathbf{X}^{\prime} \mathbf{u}}{n}\right]}_{\mathbf {\bar m_{xu}}\rightarrow \mathbf 0} \right ] = $$ $$ \operatorname{plim} \left[ \frac{\mathbf{u}^{\prime} \mathbf{u}}{n} \right] \ = \ \operatorname{plim} \left[ \frac{1}{n} \sum_{i=1}^{n} u_{i}^{2} \right] $$ The last term is an average of a sequence of random numbers and, according to the LLN, converges to it's expected value: $$ E(u_i^2) = E_x [ E(u_i^2 | \mathbf x_i)] = \sigma^2 $$ In the case of heteroscedasticity we would have: $E(u_i^2) = E_x [ E(u_i^2 | \mathbf x_i)] = E_x(\sigma_i^2) = \bar \sigma^2$. q.e.d. ::: ------------------------------------------------------------------------ ### Some examples - a Monte-Carlo simulation experiment In the following we simulate a regression model to demonstrate the concept of *consistency* - The model is: $$ y_i = 1 + 0.5 x_i + u_i $$ - In the first step, we generate a sample of 100 random draws of $u_i$ and $x_i$, with Var($\ u_i$) = 1, Var($x_i$) = 1 and Cov($u_i, x_i$) = 0 to calculate $y_i$. Then we estimate the model above with these simulated data by OLS to get $\hat\beta_0$, $\hat\beta_1$ and $\hat\sigma^2$. The procedure is repeated 5000 times and the results for $\hat\beta_0$, $\hat\beta_1$ and $\hat\sigma^2$ are shown as histogram to display the distribution for the estimated parameters - In the second, step we repeat the whole procedure, but this time with a sample length of 30000. As one can see the distributions of the estimators collapse to a spike over the true parameter values, which are marked by the vertical red lines. The estimators are consistent! - In the third and fourth step, the two steps above are repeated, but this time with Cov($u_i, x_i$) = 0.3. Hence, $E(u_i | \mathbf x_i) \neq 0$, which means our assumption MLR.4' is not met - The histograms of the estimators clearly show, that $\hat\beta_1$ and $\hat\sigma^2$ are *not consistent* any more, although the correlation between $u_i$ and $x_i$ is quite moderate ------------------------------------------------------------------------ ```{r, message=F} #| label: fig-sim1 #| fig-width: 10 #| fig-asp: 0.75 #| fig-cap: "Simulation with n=100 and 5000 replications" ret <- sim(n=100,rep=5000,nvar=1) ``` ------------------------------------------------------------------------ ```{r, message=F} #| label: fig-sim2 #| fig-width: 10 #| fig-asp: 0.75 #| fig-cap: "Simulation with n=30000 and 1000 replications" ret <- sim(n=30000,rep=1000,nvar=1) ``` ------------------------------------------------------------------------ ```{r, message=F} #| label: fig-sim3 #| fig-width: 10 #| fig-asp: 0.75 #| fig-cap: "Simulation with n=100 and 5000 replications, E(u|x) != 0" ret <- sim(n=100,rep=5000,nvar=1,correu1=0.3) ``` ------------------------------------------------------------------------ ```{r, message=F} #| label: fig-sim4 #| fig-width: 10 #| fig-asp: 0.75 #| fig-cap: "Simulation with n=30000 and 1000 replications, E(u|x) != 0" ret <- sim(n=30000,rep=1000,nvar=1,correu1=0.3) ``` ------------------------------------------------------------------------ ## Asymptotic normality - For carrying out statistical test we needed LRM.6 - normality of the error term. But this assumption is often questionable, and in some cases even impossible to be met, e.g. if the depended variable is restricted to a certain range - If MLR.6 does not hold, the results of *t*- or *F*-tests may be wrong - Fortunately, *F*- and *t*-tests still work if the sample size is large enough - Also, OLS estimates are normal in large samples even without MLR.6 Hence, we can state the following theorem: ::: {#thm-asymptNormal} ## Asymptotic normality of $\hat {\boldsymbol \beta}$ Under assumptions MLR.1 – MLR.5: $$ \sqrt{n}(\hat {\boldsymbol{\beta}} - \boldsymbol{\beta}) \ \stackrel{d}{\longrightarrow} \ N \left( \mathbf 0, \ \sigma^2 \mathbf m_{\mathbf {xx}}^{-1} \right) $$ And a consistent *estimate* of the asymptotic distribution of $\hat{\boldsymbol \beta}$ is: $$ \hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \hat \sigma^2 (\mathbf X' \mathbf X)^{-1} \right) $$ In particular, this trivially implies: $$ \dfrac{(\hat\beta_j - \beta_j)}{ se(\hat\beta_j)} \ \, \stackrel{a}{\sim} \ \, N(0,\, 1) $$ And furthermore, if $E(\mathbf{R} \hat{\boldsymbol{\beta}}) = \mathbf{c})$ and $\mathbf R$ has *m* rows: $$ W \ = \ (\mathbf{R} \hat{\boldsymbol{\beta}}-\mathbf{c})^{\prime}\left(\sigma^{2} \, \mathbf{R}(\mathbf{X} \mathbf{X})^{-1} \mathbf{R}^{\prime}\right)^{-1}(\mathbf{R} \hat{\boldsymbol{\beta}}-\mathbf{c}) \ \, \stackrel{a}{\sim} \ \, \chi^{2}(m) $$ ::: ------------------------------------------------------------------------ ### Proof of asymptotic normality {#sec-assympt_normality} ::: {.callout-tip collapse="true" icon="false"} ## Expand to go through the proof Once more, our starting point is representation @eq-betahatConsitAltern1 from the consistency proof, @sec-cosistProof $$ \hat{\boldsymbol{\beta}} - \boldsymbol{\beta} \ = \ \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right )^{-1} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x'_i u_i \right ) $$ - From the consistency proof we know that the whole term *converges in probability* to $\mathbf 0$. Therefore, we multiply by $\sqrt n$, so that the term could *converge in distribution* to a random vector: $$ \sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \ = \ \underbrace{ \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right)^{-1}}_{\stackrel{p}{\longrightarrow} \ \mathbf {m_{xx}^{-1}}} \underbrace{\left( \frac{1}{\sqrt{n}} \sum_{i=1}^n \mathbf x'_i u_i \right ) }_{\stackrel{d}{\longrightarrow} \ N(0,\, \sigma^2 \mathbf {m_{xx}}) } $$ {#eq-asyNormDistri} - Under our standard assumptions and according to the LLN, the *first* term *converges in probability* to $$E(\mathbf x_i' \mathbf x_i )^{-1} := \mathbf m_{\mathbf {xx}}^{-1}$$ ------------------------------------------------------------------------ - The more interesting term of @eq-asyNormDistri is the *second* one. This term does not converge to $\mathbf 0$ any more, as now we divide by $\sqrt n$ and not by $n$, as it was the case in the consistency proof - According to the *Central Limit Theorem* (CLT, @thm-CLT) the second term, which is a sum of *arbitrarily distributed* random vectors divided by $\sqrt n$, *converges in distribution* under quite similar conditions as we had with the LLN to a *normally distributed* random vector with *expected value* $$ E(\mathbf x_i' u_i) = E_x[\mathbf x_i' E(u_i | \mathbf x_i) ] = \mathbf 0 $$ - and because $\operatorname{Cov}(\mathbf x_i' u_i, \mathbf x_j' u_j) = 0$, random sampling, we can write for the *covariance matrix* $$ \operatorname {Var}(\mathbf x_i' u_i) = E( \mathbf x_i' u_i u_i \mathbf x_i ) = E_x [ E(\mathbf x_i' u_i u_i \mathbf x_i | \mathbf x_i) ] = E_x[ \mathbf x_i' E(u_i^2 | \mathbf x_i) \mathbf x_i )] = \sigma^2 E_x[ \mathbf x_i' \mathbf x_i ] = \sigma^2 \mathbf m_{\mathbf {xx}} $$ - Hence, because of the convergence in probability of the first term of @eq-asyNormDistri and the covariance formula from @eq-C_covmatrix_lin_combination, the covariance matrix of $\sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta})$ is: $$ \operatorname{Var} \left( \sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \right) \ = \ \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1} \, \mathbf m_{\mathbf {xx}} \, \mathbf m_{\mathbf {xx}}'^{-1} \ = \ \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1} $$ ------------------------------------------------------------------------ So we have the fundamental result about the **limiting distribution**: $$ \sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \ \stackrel{d}{\longrightarrow} \ N \left(\mathbf 0, \ \sigma^2 \mathbf m_{\mathbf {xx}}^{-1} \right) $$ {#eq-beta_limit_distribution} Or equivalent, about the **asymptotic distribution** (which is an approximation to the true, but unknown distribution of $\hat{\boldsymbol{\beta}}$ *at finite but large samples*): $$ \hat{\boldsymbol{\beta}} \ \, \stackrel{a}{\sim} \, \, N \left(\boldsymbol{\beta}, \ \sigma^2 \mathbf m_{\mathbf {xx}}^{-1} / n\right) $$ {#eq-beta_asymt_distribution} - As $\mathbf m_{\mathbf {xx}}:=E(\mathbf x'_i \mathbf x_i)$ is generally unknown, we have to estimate it. The same is true for $\sigma^2$ (for which we already have a consistent estimator, as shown above) - A candidate for this purpose is the empirical counter part of $E(\mathbf x'_i \mathbf x_i)$. According to the LLN, a *consistent estimator for* $\mathbf m_{\mathbf {xx}}$ is: $$ \hat {\mathbf m}_{\mathbf {xx}} \, = \, \bar {\mathbf m}_{\mathbf {xx}} := \, \left( \dfrac{\mathbf X' \mathbf X}{n} \right), \, \text{ with: } \operatorname{plim} \left( \dfrac{\mathbf X' \mathbf X}{n} \right) \, = \, \operatorname{plim} \left( \frac{1}{n} \sum_{i=1}^n \mathbf x_i' \mathbf x_i \right ) \, = \, E(\mathbf x'_i \mathbf x_i) := \ \mathbf m_{\mathbf {xx}} $$ - Plugging this result into @eq-beta_asymt_distribution we get a consistent *estimate* of the asymptotic distribution of $\hat{\boldsymbol \beta}$ $$ \hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \hat \sigma^2 (\mathbf X' \mathbf X)^{-1} \right) $$ This proof the first two claims of @thm-asymptNormal, and trivially the third one. For the fourth claim we employ the theorem about the distribution of quadratic forms, @thm-plim_quad, together with the variance formula from @thm-ASigmaA. As all requirements of these theorems are met, this poofs the fourth claim. q.e.d. ::: ------------------------------------------------------------------------ In the proof above we have shown that a consistent *estimate* of the asymptotic distribution of $\hat{\boldsymbol \beta}$ is: $$ \hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \hat \sigma^2 (\mathbf X' \mathbf X)^{-1} \right) \tag{5.9} $$ - This is exactly the same as the small sample distribution under MLR.6. *Hence, all our test procedures remain the same*! As an *example*, consider the formula for the variance, respectively standard error of a particular parameter (this is the $j^{th}$ main diagonal element of the estimated asymptotic covariance matrix, see @eq-covMat_betahat) $$ \widehat{\operatorname{Var}(\hat \beta_j)} = \dfrac{\hat\sigma^2}{SST_{j} (1-R_j^2)} $$ - $\hat\sigma^2$ and $(1-R_j^2)$ converge to fixed numbers, and $SST_{j}$ "converge" to $\, n \cdot \operatorname{Var}(x_j)$ - So, $\widehat{\operatorname{Var}(\hat{\boldsymbol \beta}_j)}$ shrinks with the rate $1/n$, and $se(\hat{\boldsymbol \beta}_j)$ shrinks with the rate $1/\sqrt n$ - *This is why large samples are better* ------------------------------------------------------------------------ If we allow for *heteroskedasticity* (abandon MLR.5), the proof for asymptotic normality remains valid with some alterations: Instead of MLR.5 we assume: $E(u_i^2|\mathbf x_i) = \bar\sigma^2 w_i$, with $w_i > 0, \ \sum_i w_i = n$ and finally get: $$ \hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ (\mathbf X' \mathbf X)^{-1} \, (\widehat{\bar \sigma^2 \mathbf X' \boldsymbol\Omega \mathbf X}) \, (\mathbf X' \mathbf X)^{-1} \right) $$ ::: {.callout-tip collapse="true" icon="false"} ## Sketch of proof under heteroskedasticity - Calculating the covariance matrix of the last term in @eq-asyNormDistri we instead now have $$ \operatorname{Var}(\mathbf x_i'u_i) = E( \mathbf x_i' u_i u_i \mathbf x_i ) = E_x [ E(\mathbf x_i' u_i u_i \mathbf x_i | \mathbf x_i) ] = E_x[ \mathbf x_i' E(u_i^2 | \mathbf x_i) \mathbf x_i )] = \bar \sigma^2 E_x[ \mathbf x_i' w_i \mathbf x_i ] = \bar\sigma^2 \mathbf m_{\mathbf {xwx}} $$ - This leads to the asymptotic covariance matrix and asymptotic distribution: $$ \operatorname{Var} \left( \sqrt{n}(\hat{\boldsymbol{\beta}} - \boldsymbol{\beta}) \right) \ = \ \bar \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1} \, \mathbf m_{\mathbf {xwx}} \, \mathbf m_{\mathbf {xx}}'^{-1} \Rightarrow $$ $$ \hat{\boldsymbol{\beta}} \ \, \stackrel{a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ \bar \sigma^2 \, \mathbf m_{\mathbf {xx}}^{-1} \, \mathbf m_{\mathbf {xwx}} \, \mathbf m_{\mathbf {xx}}'^{-1} / n\right) $$ - Estimating $\mathbf m_{\mathbf {xx}}$ with $(\mathbf X' \mathbf X/n)$ and $\bar \sigma^2 \mathbf m_{\mathbf {xwx}}$ with $\widehat{ \bar \sigma^2 \mathbf X' \boldsymbol\Omega \mathbf X/n})$, $\, \bar \sigma^2 \boldsymbol\Omega$ being a diagonal matrix with diagonal elements $\bar \sigma^2 w_i$ replaced by $\hat u_i^2$ (which needs to be shown that this procedure leads to a consistent estimate, @white1980heteroskedasticity), we finally get: $$ \hat{\boldsymbol \beta} \ \, \stackrel{\hat a}{\sim} \ \, N \left(\boldsymbol{\beta}, \ (\mathbf X' \mathbf X)^{-1} \, (\widehat{\bar \sigma^2 \mathbf X' \boldsymbol\Omega \mathbf X}) \, (\mathbf X' \mathbf X)^{-1} \right) $$ ::: ------------------------------------------------------------------------ ### An examples - a Monte-Carlo simulation experiment In the following we simulate a regression model to demonstrate the concept of *asymptotic normality* - The model is the same we used to investigate consistency. But this time, the error term $u_i$ is not normally distributed but $\chi^2(2)$ distributed (scaled to exhibit a variance of 1) which is an extreme right skewed distribution - In the first step, we generate a sample of 15 random draws of $u_i$ and $x_i$, with Var($\ u_i$) = 1, Var($x_i$) = 1 and Cov($u_i, x_i$) = 0 to calculate $y_i$. Then we estimate the model with these simulated data by OLS to get $\hat\beta_0$, $\hat\beta_1$ and $\hat\sigma^2$. The procedure is repeated 10000 times and the results for $\hat\beta_0$, $\hat\beta_1$ and $\hat\sigma^2$ are shown as histogram to display the distribution for the estimated parameters - The histograms for $\hat\beta_0$, $\hat\beta_1$ and $\hat\sigma^2$ show a pronounced skewed distributions, as well as a too large kurtosis - In the second, step we repeat the whole procedure, but this time with a sample length of 250. As one can see the distributions of the estimators are now approximately normally distributed. Although we assumed an extreme skewed distribution of the error term, a quite short sample of 250 (which is a typical sample length of quarterly time series) is sufficient for approximately normally distributed estimators in our experiment - Clearly, with a sample size of 30000, the distribution of the estimators collapses to a spike, so the OLS estimators remain consistent ------------------------------------------------------------------------ ```{r} #| label: fig-sim5 #| fig-width: 10 #| fig-asp: 0.75 #| fig-cap: "Simulation with n=15 and 10000 replications, error term is chi^2^(2)" sim(n=15, rep=10000, distr=1, histdistr=1) ``` ------------------------------------------------------------------------ ```{r} #| label: fig-sim6 #| fig-width: 10 #| fig-asp: 0.75 #| fig-cap: "Simulation with n=250 and 10000 replications, error term is chi^2^(2)" sim(n=250, rep=10000, distr=1, histdistr=1) ``` ------------------------------------------------------------------------ ```{r} #| label: fig-sim7 #| fig-width: 10 #| fig-asp: 0.75 #| fig-cap: "Simulation with n=3000 and 1000 replications, error term is chi^2^(2)" sim(n=30000, rep=1000, distr=1, histdistr=0) ```