【问题标题】:Randomly delete and add rows based on ID根据ID随机删除和添加行
【发布时间】:2020-12-16 19:06:52
【问题描述】:

有谁知道如何根据 ID 随机删除和添加行?这是一个可重现的示例:

> y <- rnorm(20)
> x <- rnorm(20)
> z <- rep(1:5, 4)
> w <- rep(1:4, each=5)
> data.frame(id=z,cluster=w,x=x,y=y) #this is a balanced dataset
   id cluster           x           y
1   1       1  0.30003855  0.65325768
2   2       1 -1.00563626 -0.12270866
3   3       1  0.01925927 -0.41367651
4   4       1 -1.07742065 -2.64314895
5   5       1  0.71270333 -0.09294102
6   1       2  1.08477509  0.43028470
7   2       2 -2.22498770  0.53539884
8   3       2  1.23569346 -0.55527835
9   4       2 -1.24104450  1.77950291
10  5       2  0.45476927  0.28642442
11  1       3  0.65990264  0.12631586
12  2       3 -0.19988983  1.27226678
13  3       3 -0.64511396 -0.71846622
14  4       3  0.16532102 -0.45033862
15  5       3  0.43881870  2.39745248
16  1       4  0.88330282  0.01112919
17  2       4 -2.05233698  1.63356842
18  3       4 -1.63637927 -1.43850664
19  4       4  1.43040234 -0.19051680
20  5       4  1.04662885  0.37842390

根据ID随机添加和删除一些数据后,数据集如下所示,观察总数应与上述一致:

            id   cluster   x     y
       1     1       1  0.895 -0.659 
       2     2       1 -0.160 -0.366 
       3     1       2 -0.528 -0.294 
       4     2       2 -0.919  0.362 
       5     3       2 -0.901 -0.467 
       6     1       3  0.275  0.134 
       7     2       3  0.423  0.534 
       8     3       3  0.929 -0.953 
       9     4       3  1.67   0.668 
      10     5       3  0.286  0.0872
      11     1       4 -0.373 -0.109 
      12     2       4  0.289  0.299 
      13     3       4 -1.43  -0.677 
      14     4       4 -0.884  1.70  
      15     5       4  1.12   0.386 
      16     1       5 -0.723  0.247 
      17     2       5  0.463 -2.59  
      18     3       5  0.234  0.893 
      19     4       5 -0.313 -1.96  
      20     5       5  0.848 -0.0613

【问题讨论】:

    标签: data-manipulation data-generation


    【解决方案1】:

    我们可以通过从新的 id-cluster ID id2 中获取大小为 nrow(d)sample 来采样要从每个集群中删除/添加的行数。然后,我们只需根据该样本大小添加一些行并将簇数加一。

    通过检查显示的预期输出,您可能希望结果中每个集群至少有 2 个 nob。我们可以在函数参数中处理它,并防止与一些stopifnots 的无意义组合。一个repeat循环breaks然后当条件满足时。

    FUN <- function(d, cl.obs=2, min.cl=NA) {
      l.cl <- length(unique(d$cluster))
      if (is.na(min.cl)) min.cl <- l.cl
      stopifnot(cl.obs <= min(table(d$cluster)))
      stopifnot(min.cl <= l.cl + 1)
      stopifnot(cl.obs*min.cl <= nrow(d))
      d$id2 <- Reduce(paste, d[c("id", "cluster")])
      repeat({
        samp <- sample(d$id2, sample(1:nrow(d), 1))
        l <- length(samp)
        if (l == 0) {
          return(d[,-5])
        }
        else {
          a <- cbind(id=1:l, cluster= max(d$cluster) + 1, 
                     matrix(rnorm(l*2),,2, dimnames=list(NULL, letters[24:25])))
          o <- rbind(d[!d$id2 %in% samp, -5], a)
          (cl.tb <- table(o$cluster))
          if (all(cl.tb >= cl.obs) & length(cl.tb) >= min.cl) break
        }
      })
      return(`rownames<-`(o, NULL))
    }
    
    set.seed(42)
    FUN(d)
    #    id cluster           x           y
    # 1   1       1 -0.30663859  1.37095845
    # 2   2       1 -1.78130843 -0.56469817
    # 3   3       1 -0.17191736  0.36312841
    # 4   4       1  1.21467470  0.63286260
    # 5   1       2 -0.43046913 -0.10612452
    # 6   2       2 -0.25726938  1.51152200
    # 7   3       2 -1.76316309 -0.09465904
    # 8   4       2  0.46009735  2.01842371
    # 9   5       2 -0.63999488 -0.06271410
    # 10  1       3  0.45545012  1.30486965
    # 11  2       3  0.70483734  2.28664539
    # 12  3       3  1.03510352 -1.38886070
    # 13  5       3  0.50495512 -0.13332134
    # 14  2       4 -0.78445901 -0.28425292
    # 15  3       4 -0.85090759 -2.65645542
    # 16  4       4 -2.41420765 -2.44046693
    # 17  5       4  0.03612261  1.32011335
    # 18  1       5 -0.43144620 -0.78383894
    # 19  2       5  0.65564788  1.57572752
    # 20  3       5  0.32192527  0.64289931
    

    使用参数:

    set.seed(666)
    FUN(d, cl.obs=1)
    #    id cluster           x           y
    # 1   4       1  1.21467470  0.63286260  ## just one obs in cl. 1
    # 2   2       2 -0.25726938  1.51152200
    # 3   3       2 -1.76316309 -0.09465904
    # 4   5       2 -0.63999488 -0.06271410
    # 5   1       3  0.45545012  1.30486965
    # 6   3       3  1.03510352 -1.38886070
    # 7   5       3  0.50495512 -0.13332134
    # 8   1       4 -1.71700868  0.63595040
    # 9   3       4 -0.85090759 -2.65645542
    # 10  1       5 -0.08365711  0.07771005
    # 11  2       5  0.25683143  2.12925556
    # 12  3       5 -1.07362365  0.63895459
    # 13  4       5 -0.62286788  0.26934743
    # 14  5       5  0.28499111  2.29896933
    # 15  6       5  1.05156653 -1.37464590
    # 16  7       5 -0.25952120  0.66236713
    # 17  8       5  0.02230428  0.48351632
    # 18  9       5 -0.01440929  1.23229183
    # 19 10       5  1.33285534 -1.77762517
    # 20 11       5  0.14842679  0.88552740
    

    数据:

    d <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 
    1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), cluster = c(1L, 1L, 
    1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
    4L, 4L), x = c(-0.306638594078475, -1.78130843398, -0.171917355759621, 
    1.2146746991726, 1.89519346126497, -0.4304691316062, -0.25726938276893, 
    -1.76316308519478, 0.460097354831271, -0.639994875960119, 0.455450123241219, 
    0.704837337228819, 1.03510352196992, -0.608926375407211, 0.50495512329797, 
    -1.71700867907334, -0.784459008379496, -0.850907594176518, -2.41420764994663, 
    0.0361226068922556), y = c(1.37095844714667, -0.564698171396089, 
    0.363128411337339, 0.63286260496104, 0.404268323140999, -0.106124516091484, 
    1.51152199743894, -0.0946590384130976, 2.01842371387704, -0.062714099052421, 
    1.30486965422349, 2.28664539270111, -1.38886070111234, -0.278788766817371, 
    -0.133321336393658, 0.635950398070074, -0.284252921416072, -2.65645542090478, 
    -2.44046692857552, 1.32011334573019)), class = "data.frame", row.names = c(NA, 
    -20L))
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2012-10-10
      相关资源
      最近更新 更多