【问题标题】：How (in a vectorized manner) to retrieve single value quantities from data frame cells containing numeric arrays如何（以矢量化方式）从包含数值数组的数据框单元格中检索单值数量
【发布时间】：2015-06-22 15:36:27
【问题描述】：

我有一个数据框，其中包含如下所示的列：

  lengthArray                    speed_max
1           4               24, 18, 24, 18
2          10 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
3           4       -999, -999, -999, -999
4           2                   -999, -999
5           2                       18, 18
6           1                         -999

对于这些列，我编写了一个函数，用来从数据框每一行的数组中提取平均值、中位数、最大值或最小值，但我觉得这可以用更快的方式完成。这是我目前的代码：

# Question's original (incomplete) attempt: walk a list-column of `mydata`
# row by row and collect one number per row into a numeric vector.
#
# Args:
#   name:      column name used to look up `mydata[[name]]`.
#   to_return: numeric code choosing the summary; only the branches for
#              0 (uses max) and 1 (uses min) are shown below.
#
# Returns:
#   The last expression is an assignment, so `alt_vector` is returned
#   invisibly.
#
# NOTE(review): `mydata` is read from the enclosing/global environment.
# NOTE(review): `alt_max` is never defined inside this function, while
#   `vec_list` is assigned and never used -- presumably `vec_list` was meant
#   wherever `alt_max` appears; confirm with the author.
get_scalar <- function(name, to_return = 1)
{

  vec_list = mydata[[name]]
  # Preallocate the result (a numeric vector of zeros).
  alt_vector = vector(mode = "numeric", length = length(alt_max))
  i = 1
  # Depending on what the user wants, return max, min, mean or median
  # for each array, one per row.
  if(to_return == 0){
    for(entry in alt_max){
      # NOTE(review): which() yields the positions of the non -999 elements,
      # so this stores the largest position, not the largest value.
      alt_vector[i] = max(which(alt_max[i][[1]] != -999))
      i = i + 1
    }
  }else if (to_return==1){
    for(entry in alt_max){
      # NOTE(review): likewise, this stores the smallest matching position.
      alt_vector[i] = min(which(alt_max[i][[1]] != -999))
      i = i + 1
    }
  }
  ...
  # and repeated for two other cases (elided in the original post)
  ...
  # then finally return the results as a numeric vector
  alt_vector = as.numeric(alt_vector)   

}

此函数的预期输出是一个数值向量，其中每个元素对应数据框中一行数组的所需统计量。例如，如果我运行 get_scalar("speed_max", to_return = 0)，根据上面粘贴的数据，我希望返回的数值向量以 (24, 2, NA, ...) 开头：第 1 行 "speed_max" 数组的最大值是 24，第 2 行 "speed_max" 数组的最大值是 2，而第 3 行不包含任何有效数据（-999 表示该值缺失）。

我找不到用 sapply 编写这段代码的方法，无法访问每个单元格中列表的第一个成员。例如，下面的写法会报错：

> gg = max(mydata[[speed_max]][[1]])
Error in (function(x, i, exact) if (is.matrix(i)) as.matrix(x)[[i]] else .subset2(x,  : 
  object 'speed_max' not found

如果我尝试像这样巧妙地重写，我似乎无法访问每行的单个数组。例如，这个函数只是打印出许多 0：

# Question's sapply-based attempt; kept as posted because the surrounding text
# discusses its (unwanted) output.
#
# Args:
#   name:      column name used to look up `mydata[[name]]`.
#   to_return: numeric code; only the value 1 is handled.
#
# Returns:
#   The last expression is an assignment, so `alt_vector` is returned
#   invisibly.
#
# NOTE(review): `alt_max` is not defined in this function and `vec_list` is
#   never used -- presumably `vec_list` was intended; confirm with the author.
get_scalar_sapply <- function(name, to_return = 1)
{
  vec_list = mydata[[name]]
  # vector(mode = "numeric", ...) creates a vector filled with zeros.
  alt_vector = vector(mode = "numeric", length = length(alt_max))
  if(to_return == 1){
      #alt_vector =sapply(alt_vector, function(x)  max(which(x[[1]] != -999)))
    # sapply iterates over `alt_vector` (the zeros created above), not over
    # the list-column, so every `x` printed here is 0.
    alt_vector = sapply(alt_vector, function(x)  print(x[[1]]))
  }
  alt_vector = as.numeric(alt_vector)   

}

附录：应要求提供的 `dput(mydata)` 输出

> dput(head(mydata))
structure(list(endo = c(20216392L, 20167990L, 20211929L, 20214641L, 
20206551L, 20178293L), lengthArray = c(4L, 10L, 4L, 2L, 2L, 1L
), sport = list(c(24, 18, 24, 18), c("2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2"), c("-999", "-999", "-999", "-999"), 
    c("-999", "-999"), c("18", "18"), "-999"), local_start_time = list(
    c(NA_real_, NA_real_, NA_real_, NA_real_), c("u'2015-05-03T17:14:13.000Z'", 
    "u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'", 
    "u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'", 
    "u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'", 
    "u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'", 
    "u'2015-05-03T17:13:22.000Z'"), c("u'2015-02-25T10:02:10.000Z'", 
    "u'2015-02-02T22:37:34.000Z'", "u'2015-02-25T10:02:10.000Z'", 
    "u'2015-02-02T22:37:34.000Z'"), c("u'2015-02-02T18:28:23.000Z'", 
    "u'2015-02-02T18:28:23.000Z'"), c("u'2015-02-02T10:42:27.000Z'", 
    "u'2015-02-02T10:42:27.000Z'"), "u'2015-01-31T10:35:54.000Z'"), 
    distance = list(c(-999, 1.32598698139191, -999, 1.32598698139191
    ), c("-999", "-999", "-999", "-999", "-999", "-999", "-999", 
    "-999", "-999", "-999"), c("15.499165534973145", "-999", 
    "15.499165534973145", "-999"), c("6.071850776672363", "6.071850776672363"
    ), c("-999", "-999"), "-999"), duration = list(c(4, 1103, 
    4, 1103), c("8.0", "15.0", "8.0", "15.0", "8.0", "15.0", 
    "8.0", "15.0", "8.0", "15.0"), c("19492.0", "56.0", "19492.0", 
    "56.0"), c("1936.0", "1936.0"), c("3.0", "3.0"), "4083.49"), 
    speed_avg = list(c(-999, 4.32779069175962, -999, 4.32779069175962
    ), c("-999", "-999", "-999", "-999", "-999", "-999", "-999", 
    "-999", "-999", "-999"), c("2.862558789549729", "-999", "2.862558789549729", 
    "-999"), c("11.290631609514724", "11.290631609514724"), c("-999", 
    "-999"), "-999"), altitude_max = list(c(-999, 366, -999, 
    366), c("-999", "-999", "-999", "-999", "-999", "-999", "-999", 
    "-999", "-999", "-999"), c("335.5", "-999", "335.5", "-999"
    ), c("520.0", "520.0"), c("624.0", "624.0"), "-999"), altitude_min = list(
        c(-999, 223, -999, 223), c("-999", "-999", "-999", "-999", 
        "-999", "-999", "-999", "-999", "-999", "-999"), c("-156.0", 
        "-999", "-156.0", "-999"), c("453.0", "453.0"), c("624.0", 
        "624.0"), "-999"), speed_max = list(c(-999, 5.01253, 
    -999, 5.01253), c("-999", "-999", "-999", "-999", "-999", 
    "-999", "-999", "-999", "-999", "-999"), c("66.8202", "-999", 
    "66.8202", "-999"), c("19.8268", "19.8268"), c("-999", "-999"
    ), "-999"), ascent = list(c(-999, 140, -999, 140), c("-999", 
    "-999", "-999", "-999", "-999", "-999", "-999", "-999", "-999", 
    "-999"), c("-999", "-999", "-999", "-999"), c("173.0", "173.0"
    ), c("-999", "-999"), "-999"), descent = list(c(-999, 272, 
    -999, 272), c("-999", "-999", "-999", "-999", "-999", "-999", 
    "-999", "-999", "-999", "-999"), c("-999", "-999", "-999", 
    "-999"), c("174.0", "174.0"), c("-999", "-999"), "-999"), 
    title = list(c(-999, -999, -999, -999), c("-999", "-999", 
    "-999", "-999", "-999", "-999", "-999", "-999", "-999", "-999"
    ), c("-999", "-999", "-999", "-999"), c("-999", "-999"), 
        c("-999", "-999"), "-999"), num_runs = c(0L, 0L, 0L, 
    0L, 0L, 0L), percent_runs = c(0, 0, 0, 0, 0, 0)), .Names = c("endo", 
"lengthArray", "sport", "local_start_time", "distance", "duration", 
"speed_avg", "altitude_max", "altitude_min", "speed_max", "ascent", 
"descent", "title", "num_runs", "percent_runs"), row.names = c(NA, 
6L), class = "data.frame")

【问题讨论】：

标签： r dataframe vectorization

【解决方案1】：

看起来您正试图从列表中的每个条目中获取摘要函数，而忽略设置为 -999 的元素。你可以这样做：

# Summarise every cell of a list-column, ignoring the missing-value sentinel.
#
# Args:
#   name:          Name (string) of a list-column in `data`.
#   FUN:           Summary function applied to the valid values of each cell
#                  (e.g. max, min, mean, median, length). It must return a
#                  single number.
#   data:          Data frame holding the column; defaults to the global
#                  `mydata`, as in the original one-liner.
#   missing_value: Sentinel that marks an omitted reading (default -999).
#
# Returns:
#   A numeric vector with one entry per row of `data`; NA for rows whose cell
#   holds no valid value.
get_scalar <- function(name, FUN = max, data = mydata, missing_value = -999) {
  FUN <- match.fun(FUN)
  vapply(data[[name]], function(x) {
    # Cells may be stored as character; convert first so the sentinel is
    # compared numerically ("-999.0" matches too).
    vals <- as.numeric(x)
    # Drop NA together with the sentinel: an NA inside a cell would otherwise
    # make the emptiness check below fail with "missing value where
    # TRUE/FALSE needed".
    vals <- vals[!is.na(vals) & vals != missing_value]
    if (length(vals) == 0L) NA_real_ else FUN(vals)
  }, numeric(1))
}

请注意，我通过传递一个应用于每个列表元素的实际函数而不是对应于函数的数字代码来稍微更改了您的函数。这使您的函数更加灵活，因为它现在可以轻松地接受任何处理函数。

让我们看一下提供的mydata 列表中的一个示例：

# Look at the list:
mydata$speed_max
# [[1]]
# [1] -999.00000    5.01253 -999.00000    5.01253
# 
# [[2]]
#  [1] "-999" "-999" "-999" "-999" "-999" "-999" "-999" "-999" "-999" "-999"
# 
# [[3]]
# [1] "66.8202" "-999"    "66.8202" "-999"   
# 
# [[4]]
# [1] "19.8268" "19.8268"
# 
# [[5]]
# [1] "-999" "-999"
# 
# [[6]]
# [1] "-999"

# Minimum of the values that are not -999 in each row
# (NA when every value in the row is -999)
get_scalar("speed_max", min)
# [1]  5.01253       NA 66.82020 19.82680       NA       NA

# Number of values that are not -999 in each row (NA if there are none)
get_scalar("speed_max", length)
# [1]  2 NA  2  2 NA NA

【讨论】：

附录，作为dput(mydata) 的请求输出

附录，作为`dput(mydata)` 的请求输出