【发布时间】:2015-07-28 13:58:39
【问题描述】:
我正在尝试将一些数据从一个数据帧重新排列到另一个数据帧。我认为我需要的解决方案将包含在 plyr 包中,但我还没有找到完整的解决方案。
大纲
我有一系列由不同数量的点组成的横断面。每个横断面可以分成三个点组成的非独立组,形成一个Leg。
输入数据:我有每个横断面上每个点的坐标:
# Subset of Points data
# dput() output of a 39-row data.frame with one row per point:
#   Transect - factor naming the transect the point belongs to
#              (codes used here: 73 = "T076", 72 = "T075", 23 = "S125",
#              14 = "S109", 7 = "L2")
#   Point    - position of the point along its transect, counted from 1
#   x, y     - coordinates of the point
# The expression is not assigned here; the snippets further down presumably
# refer to it as `Points` - confirm.
structure(list(Transect = structure(c(73L, 73L, 73L, 73L, 73L, 73L, 72L, 72L, 72L, 72L, 72L, 72L, 23L, 23L, 23L, 14L, 14L, 14L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L),
.Label = c("B", "D", "E", "F", "G", "L1", "L2", "L3", "L4", "L5", "L9", "S101", "S105", "S109", "S116", "S117", "S118", "S119", "S121", "S122", "S123", "S124", "S125", "S126", "T001", "T002", "T003", "T004", "T006", "T007", "T008", "T009", "T010", "T011", "T012", "T013", "T014", "T015", "T016", "T017", "T018", "T019", "T022", "T023", "T024", "T026", "T028", "T029", "T030", "T031", "T032", "T033", "T035", "T039", "T040", "T043", "T049", "T050", "T051", "T056", "T060", "T061", "T062", "T063", "T065", "T066", "T067", "T068", "T072", "T073", "T074", "T075", "T076", "T077", "T078", "T079", "T082N", "T083", "T087", "T088", "T092", "T093", "T095", "T096", "T097"),
class = "factor"),
Point = c(1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21),
x = c(38.53, 38.53409, 38.53818, 38.53396, 38.52984, 38.53006, 38.45, 38.44936, 38.44942, 38.45324, 38.45743, 38.45382, 38.29102, 38.29013, 38.28935, 37.7798, 37.7803, 37.78109, 38.08238, 38.07932, 38.07534, 38.07143, 38.06737, 38.06339, 38.0596, 38.05605, 38.05261, 38.0489, 38.0444, 38.04113, 38.03668, 38.03237, 38.02786, 38.0234, 38.01895, 38.01524, 38.01481, 38.01465, 38.013),
y = c(4.23, 4.22811, 4.22622, 4.22465, 4.22281, 4.22553, 4.22, 4.22445, 4.22897, 4.22659, 4.22481, 4.22239, 5.37832, 5.37391, 5.36949, 5.0068, 5.01126, 5.0157, 4.95384, 4.95693, 4.95914, 4.96122, 4.96315, 4.96527, 4.96772, 4.97052, 4.97344, 4.97601, 4.97695, 4.97998, 4.98097, 4.98002, 4.97972, 4.98019, 4.98, 4.98272, 4.98715, 4.99165, 4.9958)),
.Names = c("Transect", "Point", "x", "y"),
row.names = c(NA, -39L),
class = "data.frame")
以及每个 Transect 中各个 Leg 的标识:
# Subset of Legs IDs
# dput() output of an 18-row data.frame with one row per leg:
#   Transect - factor naming the transect (same level set as the points data)
#   Leg      - factor naming the leg by its end labels ("A-B", "B-C", ...);
#              "C-A" occurs only for the transects coded 73 and 72
#              ("T076", "T075"), the two that have six points
# The expression is not assigned here; the snippets further down presumably
# refer to it as `Leg` - confirm.
structure(list(Transect = structure(c(73L, 73L, 73L, 72L, 72L, 72L, 23L, 14L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L),
.Label = c("B", "D", "E", "F", "G", "L1", "L2", "L3", "L4", "L5", "L9", "S101", "S105", "S109", "S116", "S117", "S118", "S119", "S121", "S122", "S123", "S124", "S125", "S126", "T001", "T002", "T003", "T004", "T006", "T007", "T008", "T009", "T010", "T011", "T012", "T013", "T014", "T015", "T016", "T017", "T018", "T019", "T022", "T023", "T024", "T026", "T028", "T029", "T030", "T031", "T032", "T033", "T035", "T039", "T040", "T043", "T049", "T050", "T051", "T056", "T060", "T061", "T062", "T063", "T065", "T066", "T067", "T068", "T072", "T073", "T074", "T075", "T076", "T077", "T078", "T079", "T082N", "T083", "T087", "T088", "T092", "T093", "T095", "T096", "T097"),
class = "factor"),
Leg = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 1L, 1L, 2L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L),
.Label = c("A-B", "B-C", "C-A", "C-D", "D-E", "E-F", "F-G", "G-H", "H-I", "I-J", "J-K"),
class = "factor")),
.Names = c("Transect", "Leg"),
row.names = c(NA, -18L),
class = "data.frame")
数据类型
所有具有 6 个点的横断面都是三角形的(点在每个顶点和每边的中间),这样:
"Leg" == "A-B" contains "Points" == c(1,2,3)
"Leg" == "B-C" contains "Points" == c(3,4,5)
"Leg" == "C-A" contains "Points" == c(5,6,1)
所有其他横断面都是线性的,例如:
"Leg" == "A-B" contains "Points" == c(1,2,3)
"Leg" == "B-C" contains "Points" == c(3,4,5)
"Leg" == "C-D" contains "Points" == c(5,6,7)
"Leg" == "D-E" contains "Points" == c(7,8,9) etc.
部分解决方案(期望结果示例)
通过对两个输入数据帧按单个 Transect 取子集,我已经能够分别为每种类型的单个 Transect 得到所需的输出:
# when nrow(tmp) == 6 (i.e. triangular sites)
# Builds the Start/Mid/End coordinates of the three legs of one triangular
# transect. Vertices sit at points 1, 3, 5 and edge midpoints at 2, 4, 6, so
#   A-B = points 1, 2, 3;  B-C = points 3, 4, 5;  C-A = points 5, 6, 1.
# `Points` and `Leg` are the two input data frames (points and leg IDs).
tmp <- Points[Points$Transect == "T076", ]
# Rows are picked by position below, so make sure they follow Point order.
tmp <- tmp[order(tmp$Point), ]
tmp2 <- Leg[Leg$Transect == "T076", ]
stopifnot(nrow(tmp) == 6L, nrow(tmp2) == 3L)

start_idx <- c(1L, 3L, 5L)
mid_idx <- start_idx + 1L
# The last leg closes the triangle, so it ends back at point 1.
end_idx <- c(3L, 5L, 1L)

# Whole-column assignment replaces the per-row loop and the scalar ifelse().
tmp2$Start_x <- tmp$x[start_idx]
tmp2$Start_y <- tmp$y[start_idx]
tmp2$Mid_x <- tmp$x[mid_idx]
tmp2$Mid_y <- tmp$y[mid_idx]
tmp2$End_x <- tmp$x[end_idx]
tmp2$End_y <- tmp$y[end_idx]
# when nrow(tmp) != 6 (i.e. straight line sites)
# Builds the Start/Mid/End coordinates of every leg of one linear transect.
# Consecutive legs share an end point, so leg k spans points
# 2k - 1 (start), 2k (middle) and 2k + 1 (end).
# `Points` and `Leg` are the two input data frames (points and leg IDs).
tmp <- Points[Points$Transect == "L2", ]
# Rows are picked by position below, so make sure they follow Point order.
tmp <- tmp[order(tmp$Point), ]
tmp2 <- Leg[Leg$Transect == "L2", ]

# Integer division counts only complete legs. round() rounds halves to even
# (1.5 -> 2), which would index past the last point for some even point counts.
n_legs <- max((nrow(tmp) - 1L) %/% 2L, 0L)
# Each leg row must have a matching point triple; fail loudly otherwise.
stopifnot(nrow(tmp2) == n_legs)

# seq_len() yields an empty index when there are no legs (1:0 would not).
start_idx <- 2L * seq_len(n_legs) - 1L
mid_idx <- start_idx + 1L
end_idx <- start_idx + 2L

tmp2$Start_x <- tmp$x[start_idx]
tmp2$Start_y <- tmp$y[start_idx]
tmp2$Mid_x <- tmp$x[mid_idx]
tmp2$Mid_y <- tmp$y[mid_idx]
tmp2$End_x <- tmp$x[end_idx]
tmp2$End_y <- tmp$y[end_idx]
在我看来,应该可以使用ddply 和d_ply 等函数的组合来按每个Transect 分割完整的数据帧,应用相关代码,然后返回Leg 数据框,每个 Leg 都有新列“Start”、“Middle”和“End”、“x”和“y”。
但是我尝试这样做会返回错误,部分原因是:
a) 我无法用 ifelse 把三角形横断面(有 6 个点)与线性横断面(任何其他点数)区分开来
b) 我无法正确组合 plyr 函数。
代码返回错误示例
library(plyr)
# NOTE(review): this is the attempt that fails; the code is kept exactly as
# posted. Points visible from the code itself:
# - d_ply() is the plyr variant that discards the results of its function, so
#   nothing is returned from the outer call.
# - ddply() over Leg.points is nested inside d_ply() over BTVs, so every
#   transect subset `a` is processed against every transect subset `b`, not
#   only the one with the same Transect.
# - ifelse() is a vectorised selector, not control flow: its yes/no arguments
#   here are `for` loops, which evaluate to NULL, and the inner function
#   returns the ifelse() result rather than the modified `b`.
# - BTVs / Leg.points are presumably the points and leg-ID data frames called
#   Points / Leg in the single-transect snippets - confirm.
d_ply(BTVs, "Transect", function(a)
ddply(Leg.points, "Transect", function(b)
ifelse(length(a$Point)==6,
# when == 6 (i.e. triangular sites)
for (i in 1:3) {
b$Start_x[i] <- a$x[i+i-1]
b$Start_y[i] <- a$y[i+i-1]
b$Mid_x[i] <- a$x[i+i]
b$Mid_y[i] <- a$y[i+i]
b$End_x[i] <- ifelse(i==3,
a$x[1],
a$x[i+i+1])
# NOTE(review): the closing value for End_y below is a$x[1]; the
# single-transect version uses tmp$y[1], so this is likely a typo for a$y[1].
b$End_y[i] <- ifelse(i==3,
a$x[1],
a$y[i+i+1])},
# when != 6 (i.e. straight line sites)
for (i in 1:round((length(a$Point)-1)/2)) {
b$Start_x[i] <- a$x[i+i-1]
b$Start_y[i] <- a$y[i+i-1]
b$Mid_x[i] <- a$x[i+i]
b$Mid_y[i] <- a$y[i+i]
b$End_x[i] <- a$x[i+i+1]
b$End_y[i] <- a$y[i+i+1]
})))
有人可以帮忙吗?提前致谢!
【问题讨论】:
标签: r dplyr plyr data-manipulation