【问题标题】:How to calculate descriptive statistics for both numeric and categorical variables in R? 如何计算 R 中数值变量和分类变量的描述性统计量?
【发布时间】:2021-06-11 02:32:18
【问题描述】:

我正在尝试编写一个函数来计算数值和分类变量(因子)的描述性统计。 对于数值型变量,计算均值(MEAN)、中位数(MEDIAN)、标准差(SD),并计算缺失值个数(NMiss)。 对于字符变量,应将变量各级别内的计数制表并统计缺失值的个数。

起始输入数据为:

   ID GLUC TGL HDL LDL  HRT MAMM SMOKE
1   A   88  NA  32  99    Y <NA>  ever
2   B   NA 150  60  NA <NA>   no never
3   C  110  NA  NA 120    N <NA>  <NA>
4   D   NA 200  65 165 <NA>  yes never

我希望它看起来像这样:

> table1 (dat=patient, numvar=c("TGL", "HDL", "LDL"), charvar=c("HRT", "MAMM"))
$numericStats
 varName MEAN   MEDIAN SD      NMiss
1 TGL 180.66667 180.0 23.03620 4
2 HDL 55.66667  62.5  19.00175 4
3 LDL 160.28571 165.0 40.06126 3
$FactorStats
  varName group  count
1   HRT   N       2
2         Y       3
3         NMiss   5
4   MAMM  no      2
5         yes     4
6        NMiss    4

这是我目前的代码:

#numericstats
    # Summarise numeric columns of a data frame.
    #
    # dat:    data frame holding the columns to summarise.
    # numvar: character vector naming the numeric columns of `dat` to process.
    #
    # Returns a data frame with one row per entry of `numvar` and the columns
    # varname, mean, median, sd (NAs ignored, each rounded to 2 decimals) and
    # nmissing (number of NA values in the column). An empty `numvar` yields
    # an empty data frame. Stops with an error if a requested column is absent.
    findnum <- function(dat, numvar) {
      missing_cols <- setdiff(numvar, names(dat))
      if (length(missing_cols) > 0) {
        stop(
          "Columns not found in dat: ", paste(missing_cols, collapse = ", "),
          call. = FALSE
        )
      }

      # seq_along() visits every variable; `for (i in length(numvar[]))`
      # yielded a single index, so only the last variable was ever processed.
      rows <- lapply(seq_along(numvar), function(i) {
        var_select <- dat[[numvar[i]]]
        # One row per variable, labelled with that variable's own name rather
        # than the whole `numvar` vector.
        data.frame(
          varname = numvar[i],
          mean = round(mean(var_select, na.rm = TRUE), 2),
          median = round(median(var_select, na.rm = TRUE), 2),
          sd = round(sd(var_select, na.rm = TRUE), 2),
          nmissing = sum(is.na(var_select)),
          stringsAsFactors = FALSE
        )
      })

      # Bind once at the end instead of overwriting the result on each pass.
      numstats <- do.call(rbind, rows)
      if (is.null(numstats)) {
        numstats <- data.frame()
      }
      numstats
    }
    # Try the helper on the TGL, HDL and LDL columns of `patient`.
    findnum(dat=patient, numvar=c("TGL","HDL","LDL"))
    
    #factorstats
    # Tabulate categorical columns of a data frame.
    #
    # dat:     data frame holding the columns to tabulate.
    # charvar: character vector naming the categorical columns of `dat`.
    #
    # Returns a data frame with the columns varname, group and count. Each
    # variable contributes one row per observed level plus a final row whose
    # group is "NMiss" and whose count is the number of NA values. An empty
    # `charvar` yields an empty data frame. Stops with an error if a requested
    # column is absent.
    findfactor <- function(dat, charvar) {
      missing_cols <- setdiff(charvar, names(dat))
      if (length(missing_cols) > 0) {
        stop(
          "Columns not found in dat: ", paste(missing_cols, collapse = ", "),
          call. = FALSE
        )
      }

      # Iterate over every requested variable; `for (i in length(charvar[]))`
      # only produced the last index.
      rows <- lapply(charvar, function(v) {
        var_select <- dat[[v]]
        # table() drops NA by default, so missing values are counted apart.
        level_counts <- table(var_select)
        data.frame(
          varname = v,
          group = c(names(level_counts), "NMiss"),
          count = c(as.integer(level_counts), sum(is.na(var_select))),
          stringsAsFactors = FALSE
        )
      })

      factstats <- do.call(rbind, rows)
      if (is.null(factstats)) {
        factstats <- data.frame()
      }
      factstats
    }
    # Try the helper on the MAMM and SMOKE columns of `patient`.
    findfactor(dat=patient, charvar=c("MAMM","SMOKE"))
    
    #full function
    # Build the combined descriptive-statistics table.
    #
    # dat:     data frame holding the data.
    # numvar:  character vector naming the numeric columns to summarise.
    # charvar: character vector naming the categorical columns to tabulate.
    #
    # Returns a list with two elements: `numericStats`, the result of
    # findnum(dat, numvar), and `FactorStats`, the result of
    # findfactor(dat, charvar).
    table1 <- function(dat, numvar, charvar) {
      # The caller already states which columns are numeric and which are
      # categorical, so no loop over column indices is needed. The previous
      # loop tested is.numeric() on the loop counter, not on the column, and
      # returned during its first iteration with one result still undefined.
      list(
        numericStats = findnum(dat, numvar),
        FactorStats = findfactor(dat, charvar)
      )
    }

【问题讨论】:

    标签: r function statistics character numeric


    【解决方案1】:

    这是使用lapply 的一种方法:

    # Descriptive statistics for numeric and categorical columns.
    #
    # df:      data frame holding the data.
    # numvar:  character vector naming the numeric columns to summarise.
    # charvar: character vector naming the categorical columns to tabulate.
    #
    # Returns a list with two data frames:
    #   numericStats - one row per numeric variable with VarName, MEAN, MEDIAN,
    #                  SD (NAs ignored) and NMiss (number of NA values).
    #   FactorStats  - Varname, group and count; one row per observed level of
    #                  each categorical variable plus an "Nmiss" row holding
    #                  that variable's NA count.
    table1 <- function(df, numvar, charvar) {
      numeric_rows <- lapply(df[numvar], function(x) {
        data.frame(
          MEAN = mean(x, na.rm = TRUE),
          MEDIAN = median(x, na.rm = TRUE),
          SD = sd(x, na.rm = TRUE),
          # Count the missing entries. The earlier `sum(!is.na(x))` reported
          # the number of NON-missing values under the NMiss label.
          NMiss = sum(is.na(x))
        )
      })

      factor_rows <- lapply(charvar, function(x) {
        # table() drops NA, so the NA count is appended as its own entry;
        # stack() turns the named counts into a two-column data frame, and
        # [2:1] puts the label column first.
        tab <- stack(c(table(df[[x]]), Nmiss = sum(is.na(df[[x]]))))[2:1]
        names(tab) <- c("group", "count")
        cbind(Varname = x, tab)
      })

      list(
        numericStats = cbind(VarName = numvar, do.call(rbind, numeric_rows)),
        FactorStats = do.call(rbind, factor_rows)
      )
    }
    
    # Example call on the `patient` data frame defined further down.
    table1(patient, numvar=c("TGL", "HDL", "LDL"), charvar=c("HRT", "MAMM"))
    
    #$numericStats
    #    VarName  MEAN MEDIAN   SD NMiss
    #TGL     TGL 175.0    175 35.4     2
    #HDL     HDL  52.3     60 17.8     3
    #LDL     LDL 128.0    120 33.7     3
    
    #$FactorStats
    #  Varname group count
    #1     HRT     N     1
    #2     HRT     Y     1
    #3     HRT Nmiss     2
    #4    MAMM    no     1
    #5    MAMM   yes     1
    #6    MAMM Nmiss     2
    

    数据

    # Sample data: four rows (IDs A-D) with four integer measurement columns
    # and three character columns, each containing some NA values.
    patient <- data.frame(
      ID = c("A", "B", "C", "D"),
      GLUC = c(88L, NA, 110L, NA),
      TGL = c(NA, 150L, NA, 200L),
      HDL = c(32L, 60L, NA, 65L),
      LDL = c(99L, NA, 120L, 165L),
      HRT = c("Y", NA, "N", NA),
      MAMM = c(NA, "no", NA, "yes"),
      SMOKE = c("ever", "never", NA, "never"),
      # Keep the text columns as character vectors on every R version.
      stringsAsFactors = FALSE
    )
    

    【讨论】:

      猜你喜欢
      • 2022-08-06
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多