Faster Two-Way Fixed Effects Estimator in R
21 Nov 2014
In a current project, we need to run linear models on a large data frame (~70 million rows) with fixed effects for both place (e.g., grid-cell) and time (e.g., year) – a common specification for difference-in-differences models with multiple periods and groups. The base `lm`

package in `R`

choked on the task. The excellent `lfe`

package by Gaure can estimate the models but requires long run times.

In attempt to speed up our estimation routines, I built a new function that uses the `data.table`

and `RcppEigen`

packages to partial out the fixed effects and then run OLS on the de-meaned data. I include the code below in hopes that it might be useful to others facing a similar task. Note: this assumes that your panel is balanced.

To start, we need to create some fake data for analysis:

T = 6 ; N = 10 ^ 4 fake_dt <- data.table ( A = sample ( c ( 0 , 1 ), N * T , replace = TRUE ), B 1 = sample ( c ( 0 , 1 ), N * T , replace = TRUE ), B 2 = rnorm ( N ), year = rep ( 1980 : ( 1979 + T ), N ), cell = rep ( 1 : 100 , each = T ))

TwoWayFE <- function ( y , Xs , fe1 , fe2 , dt , cluster = FALSE ) { pkgs <- c ( "data.table" , "RcppEigen" ) sapply ( pkgs , require , character.only = TRUE ) keep_cols <- c ( y , Xs , fe1 , fe2 ) model_dt <- dt [, keep_cols , with = FALSE ]; rm ( keep_cols ) model_dt <- na.omit ( model_dt ) num_Xs <- length ( Xs ) new_names <- c ( "y" , paste0 ( "x" , 1 : num_Xs ), "fe1" , "fe2" ) setnames ( model_dt , 1 : ncol ( model_dt ), new_names ) # Sample Means: cols <- new_names [ ! grepl ( "fe" , new_names )] model_dt [, paste0 ( "mean_" , cols ) := lapply ( .SD , mean , na.rm = TRUE ), .SDcols = cols ] # Means by FE1: setkey ( model_dt , fe1 ) model_dt [, paste0 ( "mean_" , cols , "_fe1" ) := lapply ( .SD , mean , na.rm = TRUE ), .SDcols = cols , by = fe1 ] M <- length ( unique ( model_dt $ fe1 )) # Means by FE2: setkey ( model_dt , fe2 ) model_dt [, paste0 ( "mean_" , cols , "_fe2" ) := lapply ( .SD , mean , na.rm = TRUE ), .SDcols = cols , by = fe2 ] Y <- length ( unique ( model_dt $ fe2 )) # Demeaning: model_dt [, "y_tilde" := y - mean_y_fe2 - mean_y_fe1 + mean_y ] g <- function ( i ) { paste0 ( "x" , i , "_tilde" )} LHS <- sapply ( 1 : num_Xs , g ) f <- function ( i ) { paste0 ( "x" , i , " - mean_x" , i , "_fe2 - mean_x" , i , "_fe1 + mean_x" , i ) } RHS <- paste0 ( "list(" , paste ( sapply ( 1 : num_Xs , f ), collapse = ", " ), ")" ) model_dt [, eval ( LHS ) := eval ( parse ( text = RHS ))] x_cols <- grep ( "x\\d{1}_tilde" , names ( model_dt ), value = TRUE ) model_dt <- model_dt [, c ( "y_tilde" , eval ( x_cols ), "fe1" , "fe2" ), with = FALSE ] y <- model_dt $ y_tilde X <- model_dt [, x_cols , with = FALSE ] cluster_vec <- model_dt $ fe1 rm ( model_dt ) m <- RcppEigen :: fastLm ( X , y ) names ( m $ coefficients ) <- Xs ############################################## DoF <- m $ df.residual - ( M - 1 ) - ( Y - 1 ) - num_Xs + 1 # No intercept in model. # SEs: if ( cluster ){ N <- length ( cluster_vec ) K <- m $ rank + ( M - 1 ) + ( Y - 1 ) + 1 dfc <- ( M / ( M - 1 )) * (( N - 1 ) / ( N - K )) est_fun <- residuals ( m ) * X dt <- data.table ( est_fun , fe1 = cluster_vec ) setkey ( dt , fe1 ) dt <- dt [, lapply ( .SD , sum ), by = fe1 ][, 2 : ncol ( dt ), with = FALSE ] bread <- solve ( crossprod ( as.matrix ( X ))) * N meat <- crossprod ( as.matrix ( dt )) / N m $ se <- as.numeric ( sqrt ( dfc * 1 / N * diag ( bread %*% meat %*% bread ))) message ( "SEs Clustered on First FE" ) } else { m $ se <- m $ se * sqrt ( m $ df.residual / DoF ) message ( "SEs Not Clustered" ) } # Correcting degrees of freedom: m $ df.residual <- DoF return ( m ) }

I then check equivalence of the estimates and benchmark this function against both the `felm`

and `lm`

functions.

# CHECKING EQUIVALENCE: options ( digits = 10 ) pkgs <- c ( "RcppEigen" , "data.table" , "lfe" , "testthat" , "microbenchmark" ) sapply ( pkgs , require , character.only = TRUE ) t_dc <- system.time ( m <- TwoWayFE ( y = "A" , Xs = c ( "B1" , "B2" ), fe1 = "cell" , fe2 = "year" , dt = fake_dt , cluster = FALSE )) library ( lfe ) t_felm <- system.time ( m_felm <- felm ( A ~ B 1 + B 2 | factor ( cell ) + factor ( year ), data = fake_dt )) t_base <- system.time ( m_base <- lm ( A ~ B 1 + B 2 + factor ( cell ) + factor ( year ), data = fake_dt )) # Comparing Results: summary ( m ) $ coefficients [ 1 : 2 , 1 : 2 ] # Estimate Std. Error # B1 -0.002497459836 0.004086039606 # B2 0.001355440208 0.002022403917 summary ( m_felm ) $ coefficients [ 1 : 2 , 1 : 2 ] # Estimate Std. Error # B1 -0.002497459836 0.004086039606 # B2 0.001355440208 0.002022403917 summary ( m_base ) $ coefficients [ 2 : 3 , 1 : 2 ] # Estimate Std. Error # B1 -0.002497459836 0.004086039606 # B2 0.001355440208 0.002022403917 # Checking equivalence: b 1 _ dc <- summary ( m ) $ coefficients [ 1 , 1 ] b 1 _ felm <- summary ( m_felm ) $ coefficients [ 1 , 1 ] expect_equivalent ( b 1 _ dc , b 1 _ felm ) b 1 _ lm <- summary ( m_base ) $ coefficients [ 1 , 1 ] expect_equivalent ( b 1 _ dc , b 1 _ lm )

Here’s the punchline:

# BENCHMARKING: # More time-consuming: op <- microbenchmark ( DC = TwoWayFE ( y = "A" , Xs = c ( "B1" , "B2" ), fe1 = "cell" , fe2 = "year" , dt = fake_dt , cluster = FALSE ), FELM = felm ( A ~ B 1 + B 2 | factor ( cell ) + factor ( year ), data = fake_dt ), BASE = lm ( A ~ B 1 + B 2 + factor ( cell ) + factor ( year ), data = fake_dt ), times = 100L ) print ( op ) # Unit: milliseconds # expr min mean median max neval # DC 44.697352 63.51898258 54.3942015 162.108703 100 # FELM 502.789730 585.82423354 576.5496225 771.906130 100 # BASE 696.929222 758.06220078 737.3307755 939.178530 100 ggplot ( data = op , aes ( x = expr , y = log ( time ))) + geom_boxplot () + xlab ( "Function" ) + ylab ( "Log(time)" ) + theme_bw ()