可以先按公司拆分数据集,然后以 100 为步长(例如 seq(1, 4000, by=100))依次取 i,将拆分后数据框列表中的前 i 个公司子集合并,并对合并后的数据运行 lm:
# Partition the panel into a list with one data frame per firm, keyed by
# firm_id (the original note expects about 4,000 entries).
firms_df_list <- split(x = df, f = df$firm_id)
# Fit sales ~ size on `df` and collect the headline statistics of the fit.
#
# Args:
#   n:  Number of firms pooled into `df`; echoed back as `num_of_firms`.
#   df: Data frame providing the `sales` and `size` columns.
#
# Returns:
#   Named numeric vector of length 9: num_of_firms, sales (the estimate in
#   the second coefficient row, i.e. the term after the intercept), std_err,
#   t_stat, t_pvalue, r_sq, adj_r_sq, f_stat and f_pvalue. Statistics the
#   fit cannot provide are NA rather than an error, so every call returns a
#   vector of the same length and the runs can be stacked into a matrix.
lm_results <- function(n, df) {
  model <- lm(sales ~ size, data = df)
  res <- summary(model)

  # summary.lm() omits aliased coefficients, so the second row is missing
  # when the slope is not estimable (single observation, constant `size`).
  coefs <- res$coefficients
  slope <- if (nrow(coefs) >= 2L) coefs[2L, ] else rep(NA_real_, 4L)

  # The F statistic is absent (NULL) when only the intercept was estimated.
  p <- res$fstatistic
  if (is.null(p)) {
    f_stat <- NA_real_
    f_pvalue <- NA_real_
  } else {
    f_stat <- p[["value"]]
    f_pvalue <- unname(pf(p[1], p[2], p[3], lower.tail = FALSE))
  }

  c(
    num_of_firms = n,
    sales = slope[[1L]],
    std_err = slope[[2L]],
    t_stat = slope[[3L]],
    t_pvalue = slope[[4L]],
    r_sq = res$r.squared,
    adj_r_sq = res$adj.r.squared,
    f_stat = f_stat,
    f_pvalue = f_pvalue
  )
}
# Build the results matrix: one row per model run, one column per estimate.
# Run i pools the first i firms of `firms_df_list`, with i = 1, 101, 201, ...
# The upper bound is the actual number of firms instead of a fixed 4000, so
# the index never runs past the end of the list (which would pad the
# selection with NULL entries and repeat the same fit).
mat_results <- do.call(
  rbind,
  lapply(seq(1, length(firms_df_list), by = 100), function(i) {
    # Stack the first i per-firm data frames into one estimation sample.
    curr_df <- do.call(rbind, firms_df_list[seq_len(i)])
    # Fit the model and extract its summary statistics as a named vector;
    # rbind() turns those names into the matrix column names.
    lm_results(i, curr_df)
  })
)

# Plot the sales estimate against the number of firms pooled into each run.
plot(mat_results[, "num_of_firms"], mat_results[, "sales"],
  type = "b", col = "blue", lwd = 1, pch = 16,
  xlab = "Number of Firms", ylab = "Sales Estimate"
)
若还需要按年份和月份细分,可以使用 by(类似于 split + lapply)先按年份、再按月份划分子集,并在内部用 split 按公司拆分(与上述过程类似),每次迭代运行所需的模型。然后,先在月份级别、再在年份级别用 rbind 合并各个矩阵,得到最终的大矩阵。注意:lm_results 现在额外接收两个参数,用于在结果矩阵中生成年份和月份标识列。
# Fit sales ~ size on `df` and collect the headline statistics of the fit,
# tagged with the year and month the sample belongs to.
#
# Args:
#   n:  Number of firms pooled into `df`; echoed back as `num_of_firms`.
#   df: Data frame providing the `sales` and `size` columns.
#   yy: Year label stored in the `year` element.
#   mm: Month label stored in the `month` element.
#
# Returns:
#   Named vector of length 11: year, month, num_of_firms, sales (the
#   estimate in the second coefficient row, i.e. the term after the
#   intercept), std_err, t_stat, t_pvalue, r_sq, adj_r_sq, f_stat and
#   f_pvalue. Statistics the fit cannot provide are NA rather than an
#   error, so every call returns a vector of the same length and the runs
#   can be stacked into a matrix.
lm_results <- function(n, df, yy, mm) {
  model <- lm(sales ~ size, data = df)
  res <- summary(model)

  # summary.lm() omits aliased coefficients, so the second row is missing
  # when the slope is not estimable (single observation, constant `size`),
  # which is easy to hit on a one-firm, one-month subset.
  coefs <- res$coefficients
  slope <- if (nrow(coefs) >= 2L) coefs[2L, ] else rep(NA_real_, 4L)

  # The F statistic is absent (NULL) when only the intercept was estimated.
  p <- res$fstatistic
  if (is.null(p)) {
    f_stat <- NA_real_
    f_pvalue <- NA_real_
  } else {
    f_stat <- p[["value"]]
    f_pvalue <- unname(pf(p[1], p[2], p[3], lower.tail = FALSE))
  }

  c(
    year = yy,
    month = mm,
    num_of_firms = n,
    sales = slope[[1L]],
    std_err = slope[[2L]],
    t_stat = slope[[3L]],
    t_pvalue = slope[[4L]],
    r_sq = res$r.squared,
    adj_r_sq = res$adj.r.squared,
    f_stat = f_stat,
    f_pvalue = f_pvalue
  )
}
# Build one results matrix per year; each stacks that year's monthly runs.
firms_mat_list <- by(df, df$yy, function(sub_year) {
  # Within the year, build one results matrix per month.
  month_mat_list <- by(sub_year, sub_year$mm, function(sub_month) {
    # One data frame per firm. The full column name `firm_id` is spelled out
    # instead of relying on `$` partial matching of the prefix `firm`.
    firms_df_list <- split(sub_month, sub_month$firm_id)

    # Rows are model runs, columns are result estimates. Run i pools the
    # first i firms, with i = 1, 101, 201, ... bounded by the number of
    # firms actually present in this split rather than a fixed 4000.
    do.call(
      rbind,
      lapply(seq(1, length(firms_df_list), by = 100), function(i) {
        # Stack the first i per-firm data frames into one sample.
        curr_df <- do.call(rbind, firms_df_list[seq_len(i)])
        # Every row of the sample shares one year and month, so the first
        # row supplies the labels for the result vector.
        lm_results(i, curr_df, curr_df$yy[1], curr_df$mm[1])
      })
    )
  })
  # Stack the monthly matrices of this year.
  do.call(rbind, month_mat_list)
})

# Stack the yearly matrices into the final matrix and print it.
firms_matrix <- do.call(rbind, firms_mat_list)
firms_matrix