【发布时间】:2015-06-22 15:36:27
【问题描述】:
我有一个数据框,其中包含如下形式的列:
lengthArray speed_max
1 4 24, 18, 24, 18
2 10 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
3 4 -999, -999, -999, -999
4 2 -999, -999
5 2 18, 18
6 1 -999
对于这些列,我编写了一个函数来从数组中为数据帧的每一行提取平均值、中值、最大值或最小值,但我觉得这可以更快地完成。这是我得到的:
# Extract one summary number per data-frame row from a list column.
#
# name      : column name used to index the global data frame `mydata`.
# to_return : selector for the summary; in the lines shown, 0 takes the
#             max(...) branch and 1 takes the min(...) branch. The two
#             remaining cases are elided in this snippet.
#
# The last statement is an assignment, so the numeric vector is returned
# invisibly.
#
# NOTE(review): `vec_list` is assigned but never used in the lines shown;
# everything else reads `alt_max`, which is not defined in this function
# and must come from the enclosing environment. Presumably `vec_list`
# was intended -- confirm.
get_scalar <- function(name, to_return = 1)
{
# Column looked up by name in the global `mydata`.
vec_list = mydata[[name]]
# Result vector, pre-filled with zeros, one slot per element of `alt_max`.
alt_vector = vector(mode = "numeric", length = length(alt_max))
# Manual position counter; the loop variable `entry` below is never used.
i = 1
# depending on what user wants, return max, min, mean or median
# for each array one per row
if(to_return == 0){
for(entry in alt_max){
# `alt_max[i][[1]]` takes a length-one subset and then its first element.
# NOTE(review): which() returns the POSITIONS of the entries that differ
# from -999, so max() here gives the largest such position, not the
# largest value. When no entry qualifies, max() of an empty vector is
# -Inf with a warning rather than NA.
alt_vector[i] = max(which(alt_max[i][[1]] != -999))
i = i + 1
}
}else if (to_return==1){
for(entry in alt_max){
# Same pattern as above with min(): the smallest qualifying position
# (Inf with a warning when nothing qualifies).
alt_vector[i] = min(which(alt_max[i][[1]] != -999))
i = i + 1
}
}
...
#and repeated for two other cases
...
#then finally return the results as numeric vector
alt_vector = as.numeric(alt_vector)
}
此函数的预期输出是一个数值向量,其中每个元素对应数据框中一行的数组的所需统计量。因此,例如,如果我运行get_scalar("speed_max", to_return = 0),我希望根据上面粘贴的数据返回一个数值向量,其开头几个元素是(24, 2, NA....),因为第一行的“speed_max”数组的最大值是24,第二行的“speed_max”数组的最大值是2,而第3行不包含任何有效数据(-999 表示缺失值)。
我找不到用 sapply 改写这段代码的方法,因为我不知道如何访问每个单元格中列表的第一个成员。例如,下面的写法会报错:
> gg = max(mydata[[speed_max]][[1]])
Error in (function(x, i, exact) if (is.matrix(i)) as.matrix(x)[[i]] else .subset2(x, :
object 'speed_max' not found
如果我尝试像这样巧妙地重写,我似乎无法访问每行的单个数组。例如,这个函数只是打印出许多 0:
# Attempted sapply-based variant of get_scalar (debugging version).
#
# name      : column name used to index the global data frame `mydata`.
# to_return : only the value 1 is handled in the lines shown.
#
# The last statement is an assignment, so the numeric vector is returned
# invisibly.
#
# NOTE(review): `vec_list` is assigned but never used; the length comes
# from `alt_max`, which is not defined in this function and must come
# from the enclosing environment.
get_scalar_sapply <- function(name, to_return = 1)
{
# Column looked up by name in the global `mydata`.
vec_list = mydata[[name]]
# Numeric vector pre-filled with zeros, one slot per element of `alt_max`.
alt_vector = vector(mode = "numeric", length = length(alt_max))
if(to_return == 1){
#alt_vector =sapply(alt_vector, function(x) max(which(x[[1]] != -999)))
# sapply() iterates over `alt_vector` itself, i.e. the zeros created
# above, not over the list column. Each `x` is therefore 0, so
# print(x[[1]]) prints 0 once per element.
alt_vector = sapply(alt_vector, function(x) print(x[[1]]))
}
alt_vector = as.numeric(alt_vector)
}
附录:应要求提供的 dput(head(mydata)) 输出
> dput(head(mydata))
structure(list(endo = c(20216392L, 20167990L, 20211929L, 20214641L,
20206551L, 20178293L), lengthArray = c(4L, 10L, 4L, 2L, 2L, 1L
), sport = list(c(24, 18, 24, 18), c("2", "2", "2", "2", "2",
"2", "2", "2", "2", "2"), c("-999", "-999", "-999", "-999"),
c("-999", "-999"), c("18", "18"), "-999"), local_start_time = list(
c(NA_real_, NA_real_, NA_real_, NA_real_), c("u'2015-05-03T17:14:13.000Z'",
"u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'",
"u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'",
"u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'",
"u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'",
"u'2015-05-03T17:13:22.000Z'"), c("u'2015-02-25T10:02:10.000Z'",
"u'2015-02-02T22:37:34.000Z'", "u'2015-02-25T10:02:10.000Z'",
"u'2015-02-02T22:37:34.000Z'"), c("u'2015-02-02T18:28:23.000Z'",
"u'2015-02-02T18:28:23.000Z'"), c("u'2015-02-02T10:42:27.000Z'",
"u'2015-02-02T10:42:27.000Z'"), "u'2015-01-31T10:35:54.000Z'"),
distance = list(c(-999, 1.32598698139191, -999, 1.32598698139191
), c("-999", "-999", "-999", "-999", "-999", "-999", "-999",
"-999", "-999", "-999"), c("15.499165534973145", "-999",
"15.499165534973145", "-999"), c("6.071850776672363", "6.071850776672363"
), c("-999", "-999"), "-999"), duration = list(c(4, 1103,
4, 1103), c("8.0", "15.0", "8.0", "15.0", "8.0", "15.0",
"8.0", "15.0", "8.0", "15.0"), c("19492.0", "56.0", "19492.0",
"56.0"), c("1936.0", "1936.0"), c("3.0", "3.0"), "4083.49"),
speed_avg = list(c(-999, 4.32779069175962, -999, 4.32779069175962
), c("-999", "-999", "-999", "-999", "-999", "-999", "-999",
"-999", "-999", "-999"), c("2.862558789549729", "-999", "2.862558789549729",
"-999"), c("11.290631609514724", "11.290631609514724"), c("-999",
"-999"), "-999"), altitude_max = list(c(-999, 366, -999,
366), c("-999", "-999", "-999", "-999", "-999", "-999", "-999",
"-999", "-999", "-999"), c("335.5", "-999", "335.5", "-999"
), c("520.0", "520.0"), c("624.0", "624.0"), "-999"), altitude_min = list(
c(-999, 223, -999, 223), c("-999", "-999", "-999", "-999",
"-999", "-999", "-999", "-999", "-999", "-999"), c("-156.0",
"-999", "-156.0", "-999"), c("453.0", "453.0"), c("624.0",
"624.0"), "-999"), speed_max = list(c(-999, 5.01253,
-999, 5.01253), c("-999", "-999", "-999", "-999", "-999",
"-999", "-999", "-999", "-999", "-999"), c("66.8202", "-999",
"66.8202", "-999"), c("19.8268", "19.8268"), c("-999", "-999"
), "-999"), ascent = list(c(-999, 140, -999, 140), c("-999",
"-999", "-999", "-999", "-999", "-999", "-999", "-999", "-999",
"-999"), c("-999", "-999", "-999", "-999"), c("173.0", "173.0"
), c("-999", "-999"), "-999"), descent = list(c(-999, 272,
-999, 272), c("-999", "-999", "-999", "-999", "-999", "-999",
"-999", "-999", "-999", "-999"), c("-999", "-999", "-999",
"-999"), c("174.0", "174.0"), c("-999", "-999"), "-999"),
title = list(c(-999, -999, -999, -999), c("-999", "-999",
"-999", "-999", "-999", "-999", "-999", "-999", "-999", "-999"
), c("-999", "-999", "-999", "-999"), c("-999", "-999"),
c("-999", "-999"), "-999"), num_runs = c(0L, 0L, 0L,
0L, 0L, 0L), percent_runs = c(0, 0, 0, 0, 0, 0)), .Names = c("endo",
"lengthArray", "sport", "local_start_time", "distance", "duration",
"speed_avg", "altitude_max", "altitude_min", "speed_max", "ascent",
"descent", "title", "num_runs", "percent_runs"), row.names = c(NA,
6L), class = "data.frame")
【问题讨论】:
标签: r dataframe vectorization