# Faster Two-Way Fixed Effects Estimator in R

In a current project, we need to run linear models on a large data frame (~70 million rows) with fixed effects for both place (e.g., grid-cell) and time (e.g., year) – a common specification for difference-in-differences models with multiple periods and groups. The base lm package in R choked on the task. The excellent lfe package by Gaure can estimate the models but requires long run times.

In attempt to speed up our estimation routines, I built a new function that uses the data.table and RcppEigen packages to partial out the fixed effects and then run OLS on the de-meaned data. I include the code below in hopes that it might be useful to others facing a similar task. Note: this assumes that your panel is balanced.

To start, we need to create some fake data for analysis:

T = 6; N = 10^4
fake_dt <- data.table(
A = sample(c(0,1), N * T, replace = TRUE),
B1 = sample(c(0,1), N * T, replace = TRUE),
B2 = rnorm(N),
year = rep(1980:(1979 + T), N),
cell = rep(1:100, each = T))

TwoWayFE <- function(y, Xs, fe1, fe2, dt, cluster = FALSE) {
pkgs <- c("data.table", "RcppEigen")
sapply(pkgs, require, character.only = TRUE)

keep_cols <- c(y, Xs, fe1, fe2)
model_dt <- dt[, keep_cols, with = FALSE]; rm(keep_cols)
model_dt <- na.omit(model_dt)

num_Xs <- length(Xs)
new_names <- c("y", paste0("x", 1:num_Xs), "fe1", "fe2")
setnames(model_dt, 1:ncol(model_dt), new_names)

# Sample Means:
cols <- new_names[!grepl("fe", new_names)]
model_dt[, paste0("mean_", cols) :=
lapply(.SD, mean, na.rm = TRUE), .SDcols = cols]

# Means by FE1:
setkey(model_dt, fe1)
model_dt[,
paste0("mean_", cols, "_fe1") :=
lapply(.SD, mean, na.rm = TRUE), .SDcols = cols, by = fe1]
M <- length(unique(model_dt$fe1)) # Means by FE2: setkey(model_dt, fe2) model_dt[, paste0("mean_", cols, "_fe2") := lapply(.SD, mean, na.rm = TRUE), .SDcols = cols, by = fe2] Y <- length(unique(model_dt$fe2))

# Demeaning:
model_dt[, "y_tilde" := y - mean_y_fe2 - mean_y_fe1 + mean_y]

g <- function(i) {paste0("x",i,"_tilde")}
LHS <- sapply(1:num_Xs, g)

f <- function(i) {
paste0("x",i," - mean_x",i,"_fe2 - mean_x",i,"_fe1 + mean_x", i)
}
RHS <- paste0("list(",paste(sapply(1:num_Xs, f), collapse = ", "), ")")

model_dt[, eval(LHS) := eval(parse(text = RHS))]

x_cols <- grep("x\\d{1}_tilde", names(model_dt), value = TRUE)
model_dt <- model_dt[, c("y_tilde", eval(x_cols), "fe1", "fe2"),
with = FALSE]

y <- model_dt$y_tilde X <- model_dt[, x_cols, with = FALSE] cluster_vec <- model_dt$fe1
rm(model_dt)

m <- RcppEigen::fastLm(X, y)
names(m$coefficients) <- Xs ############################################## DoF <- m$df.residual - (M - 1) - (Y - 1) - num_Xs + 1
# No intercept in model.

# SEs:
if(cluster){
N <- length(cluster_vec)
K <- m$rank + (M - 1) + (Y - 1) + 1 dfc <- (M/(M - 1)) * ((N - 1)/(N - K)) est_fun <- residuals(m) * X dt <- data.table(est_fun, fe1 = cluster_vec) setkey(dt, fe1) dt <- dt[, lapply(.SD, sum), by = fe1][, 2:ncol(dt), with = FALSE] bread <- solve(crossprod(as.matrix(X))) * N meat <- crossprod(as.matrix(dt)) / N m$se <- as.numeric(sqrt(dfc * 1/N * diag(bread %*% meat %*% bread)))
message("SEs Clustered on First FE")
} else {
m$se <- m$se * sqrt(m$df.residual / DoF) message("SEs Not Clustered") } # Correcting degrees of freedom: m$df.residual <- DoF

return(m)
}


I then check equivalence of the estimates and benchmark this function against both the felm and lm functions.

# CHECKING EQUIVALENCE:
options(digits = 10)

pkgs <- c("RcppEigen", "data.table", "lfe", "testthat", "microbenchmark")
sapply(pkgs, require, character.only = TRUE)

t_dc <- system.time(
m <- TwoWayFE(y = "A", Xs = c("B1", "B2"),
fe1 = "cell", fe2 = "year",
dt = fake_dt,
cluster = FALSE))

library(lfe)
t_felm <- system.time(
m_felm <- felm(A ~ B1 + B2 | factor(cell) + factor(year),
data = fake_dt))

t_base <- system.time(
m_base <- lm(A ~ B1 + B2 + factor(cell) + factor(year),
data = fake_dt))

# Comparing Results:
summary(m)$coefficients[1:2, 1:2] # Estimate Std. Error # B1 -0.002497459836 0.004086039606 # B2 0.001355440208 0.002022403917 summary(m_felm)$coefficients[1:2, 1:2]
#           Estimate     Std. Error
# B1 -0.002497459836 0.004086039606
# B2  0.001355440208 0.002022403917

summary(m_base)$coefficients[2:3, 1:2] # Estimate Std. Error # B1 -0.002497459836 0.004086039606 # B2 0.001355440208 0.002022403917 # Checking equivalence: b1_dc <- summary(m)$coefficients[1,1]
b1_felm <- summary(m_felm)$coefficients[1,1] expect_equivalent(b1_dc, b1_felm) b1_lm <- summary(m_base)$coefficients[1,1]
expect_equivalent(b1_dc, b1_lm)


Here’s the punchline: # BENCHMARKING:

# More time-consuming:
op <- microbenchmark(
DC = TwoWayFE(y = "A", Xs = c("B1", "B2"), fe1 = "cell", fe2 = "year",
dt = fake_dt, cluster = FALSE),
FELM = felm(A ~ B1 + B2 | factor(cell) + factor(year), data = fake_dt),
BASE = lm(A ~ B1 + B2 + factor(cell) + factor(year), data = fake_dt),
times = 100L)

print(op)
# Unit: milliseconds
#  expr        min         mean      median        max neval
#    DC  44.697352  63.51898258  54.3942015 162.108703   100
#  FELM 502.789730 585.82423354 576.5496225 771.906130   100
#  BASE 696.929222 758.06220078 737.3307755 939.178530   100

ggplot(data = op, aes(x = expr, y = log(time))) +
geom_boxplot() +
xlab("Function") +
ylab("Log(time)") +
theme_bw()