【问题标题】:Simulate data and randomly add missing values to dataframe模拟数据并将缺失值随机添加到数据框中
【发布时间】:2018-11-04 19:47:55
【问题描述】:

如何在模拟数据框中随机向某些列或每列添加缺失值(例如,每列随机缺失约 5%),另外,是否有更有效的方法来模拟包含连续列和因子列的数据框?

 # Simulate some data
 # N is the single place that fixes the number of rows; every column below is
 # sized from it, so the sample size can be changed without breaking the
 # data.frame() call.
 N <- 2000
 data <- data.frame(
   id = seq_len(N),
   # rnorm() recycles its mean vector: each draw has sd 1 around a mean that
   # cycles through the given integer range.
   age = rnorm(N, 18:90),
   bmi = rnorm(N, 15:40),
   chol = rnorm(N, 50:350),
   insulin = rnorm(N, 2:40),
   sbp = rnorm(N, 50:200),
   dbp = rnorm(N, 30:150),
   # First half of the rows coded 1, the rest coded 2 (also valid for odd N).
   sex = rep(c(1, 2), times = c(N %/% 2, N - N %/% 2)),
   # Alternating 1, 2, 1, 2, ... over all rows.
   smoke = rep(c(1, 2), length.out = N),
   # One independent draw per row. sample(LETTERS[1:4]) on its own returns a
   # single 4-letter permutation that data.frame() would recycle, giving a
   # strictly periodic column.
   educ = sample(LETTERS[1:4], N, replace = TRUE)
 )


 # Manually blank out values that fall outside hand-picked ranges
 data <- mutate(
   data,
   age = replace(age, age < 19 | age > 88, NA),
   bmi = replace(bmi, bmi < 16 | bmi > 38, NA),
   insulin = replace(insulin, insulin > 38, NA),
   # mutate() evaluates its arguments in order, so `bmi` here is the column
   # that was just blanked above; rows whose bmi is now NA keep their educ.
   educ = replace(educ, bmi > 35, NA)
 )

【问题讨论】:

    标签: r simulation missing-data data-manipulation imputation


    【解决方案1】:

    我认为最好的解决方案是为此使用 mice 包。这是一个专门用于插补的 R 包。它还有一个名为 ampute 的函数,用于将缺失的数据引入 data.frame。

    ampute - 为模拟目的生成缺失数据 该函数以 MCAR、MAR 或 MNAR 方式生成多元缺失数据。

    此解决方案的优点是您可以设置多个参数来模拟丢失的数据。

    # Call signature of ampute() (mice package) with its arguments spelled out;
    # the values shown appear to be the documented defaults -- confirm against
    # the installed mice version.
    # Per the surrounding description, `prop` sets the proportion of missing
    # values and `mech` selects the missingness mechanism (MCAR, MAR or MNAR).
    ampute(data, prop = 0.5, patterns = NULL, freq = NULL, mech = "MAR",
      weights = NULL, cont = TRUE, type = NULL, odds = NULL,
      bycases = TRUE, run = TRUE)
    

    如您所见,您可以设置缺失值的百分比、缺失数据机制(MCAR 将是您完全随机缺失的选择)和其他几个参数。这个解决方案也很干净,因为它只有 1 行代码。

    【讨论】:

      【解决方案2】:

      这是一种tidyverse 方法,它将为您指定的每一列删除大约 20% 的数据:

      # Fix the RNG state so both the simulated data and the NA pattern are
      # reproducible
      set.seed(1)

      # Small example data set: an id plus three numeric measurements
      N <- 20
      data <- data.frame(
        id = seq_len(N),
        age = rnorm(N, 18:90),
        bmi = rnorm(N, 15:40),
        chol = rnorm(N, 50:350)
      )

      library(tidyverse)

      # Columns that should receive missing values, and the probability with
      # which each of their values is set to NA
      c_names <- c("age", "bmi")
      prc_missing <- 0.20

      data %>%
        # long format: one row per (id, variable) pair
        gather(key = "var", value = "value", -id) %>%
        mutate(
          # one uniform draw on [0, 1] per long-format row
          draw = runif(n()),
          # blank the value when its variable is targeted and the draw falls
          # at or below the threshold
          value = replace(value, var %in% c_names & draw <= prc_missing, NA)
        ) %>%
        # drop the helper column again
        select(id, var, value) %>%
        # back to the original wide layout
        spread(key = var, value = value)
      
      #    id      age      bmi     chol
      # 1   1 17.37355 15.91898 49.83548
      # 2   2 19.18364 16.78214 50.74664
      # 3   3 19.16437 17.07456 52.69696
      # 4   4       NA 16.01065 53.55666
      # 5   5 22.32951 19.61983 53.31124
      # 6   6 22.17953 19.94387 54.29250
      # 7   7 24.48743       NA 56.36458
      # 8   8 25.73832 20.52925 57.76853
      # 9   9 26.57578       NA 57.88765
      # 10 10 26.69461 24.41794 59.88111
      # 11 11 29.51178 26.35868 60.39811
      # 12 12       NA 25.89721 60.38797
      # 13 13       NA 27.38767 62.34112
      # 14 14 28.78530 27.94619 61.87064
      # 15 15 33.12493 27.62294 65.43302
      # 16 16 32.95507       NA 66.98040
      # 17 17 33.98381 30.60571 65.63278
      # 18 18 35.94384       NA 65.95587
      # 19 19 36.82122 34.10003 68.56972
      # 20 20 37.59390 34.76318 68.86495
      

      这是一种替代方法,它将为您指定的列删除 20% 的数据:

      # Fix the RNG state so both the simulated data and the NA pattern are
      # reproducible
      set.seed(1)

      # Small example data set: an id plus three numeric measurements
      N <- 20
      data <- data.frame(
        id = seq_len(N),
        age = rnorm(N, 18:90),
        bmi = rnorm(N, 15:40),
        chol = rnorm(N, 50:350)
      )

      library(tidyverse)

      # Columns that should receive missing values, the share of each column to
      # blank, and the resulting number of values to remove per column
      c_names <- c("age", "bmi")
      prc_missing <- 0.20
      n_remove <- prc_missing * nrow(data)

      data %>%
        # long format: one row per (id, variable) pair
        gather(key = "var", value = "value", -id) %>%
        # put the rows in random order
        sample_frac(1) %>%
        # number the shuffled rows separately within each variable
        group_by(var) %>%
        # blank the first n_remove shuffled rows of every targeted variable
        mutate(
          value = replace(value, var %in% c_names & row_number() <= n_remove, NA)
        ) %>%
        # grouping is no longer needed once the values are blanked
        ungroup() %>%
        # back to the original wide layout
        spread(key = var, value = value)
      
      # # A tibble: 20 x 4
      #      id   age   bmi  chol
      #   <int> <dbl> <dbl> <dbl>
      # 1     1  17.4  15.9  49.8
      # 2     2  19.2  16.8  50.7
      # 3     3  19.2  17.1  52.7
      # 4     4  NA    16.0  53.6
      # 5     5  22.3  NA    53.3
      # 6     6  22.2  19.9  54.3
      # 7     7  24.5  20.8  56.4
      # 8     8  25.7  NA    57.8
      # 9     9  26.6  NA    57.9
      # 10    10  NA    NA    59.9
      # 11    11  NA    26.4  60.4
      # 12    12  NA    25.9  60.4
      # 13    13  29.4  27.4  62.3
      # 14    14  28.8  27.9  61.9
      # 15    15  33.1  27.6  65.4
      # 16    16  33.0  29.6  67.0
      # 17    17  34.0  30.6  65.6
      # 18    18  35.9  31.9  66.0
      # 19    19  36.8  34.1  68.6
      # 20    20  37.6  34.8  68.9
      

      【讨论】:

        【解决方案3】:

        这行得通吗?

        n_rows <- nrow(data)
        perc_missing <- 5 # percentage of rows to set to NA
        # Number of rows to blank: perc_missing percent of all rows, rounded.
        n_missing <- round(perc_missing / 100 * n_rows)
        # Draw that many distinct row indices. The second argument of
        # sample.int() is the number of draws, so the count is passed directly
        # rather than a vector of already-sampled indices.
        row_missing <- sample.int(n_rows, n_missing)
        col_missing <- 1 # column (index or name) that receives the NAs
        data[row_missing, col_missing] <- NA # assign missing values
        

        【讨论】:

          猜你喜欢
          • 1970-01-01
          • 2018-03-02
          • 2013-02-15
          • 1970-01-01
          • 2014-03-29
          • 1970-01-01
          • 1970-01-01
          • 2022-01-25
          • 2014-01-19
          相关资源
          最近更新 更多