【发布时间】:2018-02-18 01:40:44
【问题描述】:
我从一个登录 xml 格式的网站进行了网页抓取,并将其变成了一个列表。现在我很难从嵌套列表中提取数据,因为它非常复杂。
这是我的 z2 结构的一部分:
dput(z2)
structure(list(scheduleList = structure(list(
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("2"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "011c", status = "-2"),
class = structure(list(name = list("013"), people = list("0"), teacher = structure(list(name = list("B")), .Names = "name", id = "D14")), .Names = c("name", "people", "teacher"), id = "602d", status = "-4"),
class = structure(list(name = list("603"), people = list("6"), teacher = structure(list(name = list("C")), .Names = "name", id = "D31")), .Names = c("name", "people", "teacher"), id = "603", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("011c"), people = list("4"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "011", status = "-2"),
class = structure(list(name = list("015c"), people = list("51"), teacher = structure(list(name = list("D")), .Names = "name", id = "D23")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class","class"), id = "2"),
score = structure(list(
class = structure(list(name = list("017c"), people = list("1"), teacher = structure(list(name = list("E")), .Names = "name", id = "D15")), .Names = c("name", "people", "teacher"), id = "017", status = "-2"),
class = structure(list(name = list("019c"), people = list("22"), teacher = structure(list(name = list("F")), .Names = "name", id = "D28")), .Names = c("name", "people", "teacher"), id = "561", status = "-4"),
class = structure(list(name = list("562d"), people = list("28"), teacher = structure(list(name = list("G")), .Names = "name", id = "D21")), .Names = c("name", "people", "teacher"), id = "562", status = "-4")),
.Names = c("class", "class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-25"),
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("80"), teacher = structure(list(name = list("H")), .Names = "name", id = "D47")), .Names = c("name", "people", "teacher"), id = "011", status = "-4"),
class = structure(list(name = list("013c"), people = list("37"), teacher = structure(list(name = list("I")), .Names = "name", id = "D18")), .Names = c("name", "people", "teacher"), id = "669", status = "-4"),
class = structure(list(name = list("751d"), people = list("15"), teacher = structure(list(name = list("J")), .Names = "name", id = "D61")), .Names = c("name", "people", "teacher"), id = "751", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("015c"), people = list("29"), teacher = structure(list(name = list("K")), .Names = "name", id = "D13")), .Names = c("name", "people", "teacher"), id = "567", status = "-2"),
class = structure(list(name = list("666d"), people = list("14"), teacher = structure(list(name = list("L")), .Names = "name", id = "D16")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class", "class"), id = "2"),
score = structure(list(
class = structure(list(name = list("015c"), people = list("21"), teacher = structure(list(name = list("M")), .Names = "name", id = "D22")), .Names = c("name", "people", "teacher"), id = "015", status = "-4"),
class = structure(list(name = list("602d"), people = list("18"), teacher = structure(list(name = list("N")), .Names = "name", id = "D10")), .Names = c("name", "people", "teacher"), id = "602", status = "-4")),
.Names = c("class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-26"),
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("33"), teacher = structure(list(name = list("O")), .Names = "name", id = "D30")), .Names = c("name", "people", "teacher"), id = "011", status = "-4"),
class = structure(list(name = list("013c"), people = list("70"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "601", status = "-2"),
class = structure(list(name = list("603d"), people = list("0"), teacher = structure(list(name = list("P")), .Names = "name", id = "D27")), .Names = c("name", "people", "teacher"), id = "603", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("011c"), people = list("56"), teacher = structure(list(name = list("H")), .Names = "name", id = "D47")), .Names = c("name", "people", "teacher"), id = "602", status = "-4"),
class = structure(list(name = list("666d"), people = list("8"), teacher = structure(list(name = list("Q")), .Names = "name", id = "D20")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class", "class"), id = "2"),
score = structure(list(
class = structure(list(name = list("017c"), people = list("5"), teacher = structure(list(name = list("R")), .Names = "name", id = "D30")), .Names = c("name", "people", "teacher"), id = "017", status = "-4"),
class = structure(list(name = list("021c"), people = list("6"), teacher = structure(list(name = list("S")), .Names = "name", id = "D19")), .Names = c("name", "people", "teacher"), id = "561", status = "-4")),
.Names = c("class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-27")),
.Names = c("schedule", "schedule", "schedule"), from = "2017-01-25", to = "2017-01-27")),
.Names = "scheduleList")
这是 z2 的一部分:
$scheduleList$schedule$score$class
$scheduleList$schedule$score$class$name
$scheduleList$schedule$score$class$name[[1]]
[1] "017C"
$scheduleList$schedule$score$class$people
$scheduleList$schedule$score$class$people[[1]]
[1] "5"
$scheduleList$schedule$score$class$teacher
$scheduleList$schedule$score$class$teacher$name
$scheduleList$schedule$score$class$teacher$name[[1]]
[1] "R"
attr(,"id")
[1] "D30"
attr(,"id")
[1] "017"
attr(,"status")
[1] "-4"
$scheduleList$schedule$score$class
$scheduleList$schedule$score$class$name
$scheduleList$schedule$score$class$name[[1]]
[1] "021C"
$scheduleList$schedule$score$class$people
$scheduleList$schedule$score$class$people[[1]]
[1] "6"
$scheduleList$schedule$score$class$teacher
$scheduleList$schedule$score$class$teacher$name
$scheduleList$schedule$score$class$teacher$name[[1]]
[1] "S"
attr(,"id")
[1] "D19"
attr(,"id")
[1] "561"
attr(,"status")
[1] "-4"
attr(,"id")
[1] "3"
attr(,"date")
[1] "2017-01-27"
attr(,"from")
[1] "2017-01-25"
attr(,"to")
[1] "2017-01-27"
我需要从嵌套列表中提取我需要的信息,因为我是新手,所以我使用了最低效的方法:
for (i in 1:length(z2[[1]])){ #length(z2[[1]])=7
for (j in 1:length(z2[[1]][[i]])){ #length(z2[[1]][[i]])=3
for (k in 1:length(z[[1]][[i]][[j]])){
cbind=(
Date=attr(z2[[1]][[i]],"date"), #date
Score=attr(z2[[1]][[i]][[j]],"id"), #score
People=z2[[1]][[i]][[j]][[k]][[2]][[1]], #people
TName=z2[[1]][[i]][[j]][[k]][[3]][[1]][[1]], #teacher name
TID=attr(z2[[1]][[i]][[j]][[k]][[3]],"id"), #teacher ID
CName=z2[[1]][[i]][[j]][[k]][[1]][[1]], #class name
CID=attr(z2[[1]][[i]][[j]][[k]],"id"), #class ID
CSta=attr(z2[[1]][[i]][[j]][[k]],"status") #class status
)
}
}
}
它在我的循环中不起作用。我想将它作为数据框或数组输出。我期待的结果:
Date Score TID TName CName CID CSta People
2017-01-25 1 D14 B 013c 602 -4 0
2017-01-26 2 D16 L 666d 666 -4 14
XML格式网站示例:
<result status="success">
<code>1</code>
<note>success</note>
<scheduleList from="2017-01-25" to="2017-01-26">
<schedule date="2017-01-25">
<score id="1">
<class id="011" status="-4">
<name>011c</name>
<people>116</people>
<teacher id="D47">
<name>A</name>
</teacher>
</class>
<class id="669" status="-4">
<name>669d</name>
<people>10</people>
<teacher id="D29">
<name>B</name>
</teacher>
</class>
</score>
<score id="2">
<class id="013" status="-4">
<name>013c</name>
<people>9</people>
<teacher id="D9">
<name>C</name>
</teacher>
</class>
</score>
<score id="3">
<class id="016" status="-4">
<name>016c</name>
<people>36</people>
<teacher id="D18">
<name>D</name>
</teacher>
</class>
<class id="019" status="-4">
<name>019c</name>
<people>9</people>
<teacher id="D30">
<name>E</name>
</teacher>
</class>
</score>
</schedule>
<schedule date="2017-01-26">
<score id="1">
<class id="011" status="-2">
<name>011c</name>
<people>2</people>
<teacher id="D29">
<name>F</name>
</teacher>
</class>
<class id="013" status="-2">
<name>013c</name>
<people>0</people>
<teacher id="D14">
<name>G</name>
</teacher>
</class>
</score>
<score id="2">
<class id="011" status="-2">
<name>011c</name>
<people>4</people>
<teacher id="D29">
<name>F</name>
</teacher>
</class>
</score>
<score id="3">
<class id="017" status="-2">
<name>017c</name>
<people>1</people>
<teacher id="D141">
<name>H</name>
</teacher>
</class>
<class id="019" status="-4">
<name>019c</name>
<people>22</people>
<teacher id="D291">
<name>I</name>
</teacher>
</class>
<class id="020" status="-4">
<name>020c</name>
<people>8</people>
<teacher id="D143">
<name>J</name>
</teacher>
</class>
</score>
</schedule>
</scheduleList>
</result>
代码:
url <- "xxxxxxx"
session <-html_session(url)
form <-html_form(read_html(url))[[1]]
filled_form <- set_values(form,
"fromDate" = "2017-01-25",
"toDate" = "2017-01-26",
"userid" = "xxx",
"Password" = "aaa")
s <- submit_form(session,filled_form)
z = read_xml(s$response)
【问题讨论】:
-
直接使用像
xml2这样的包,你可以从xml中提取而不转换为列表。你有网址或可复制的东西吗?我可以给你看。 -
@cderv 你好!我只是在上面列出的。我从网站复制了部分 xml 格式。是可复制的吗?像结果、代码和注释这样的 xml 外部是我不需要的。我肯定想向你学习:)
标签: r list dataframe nested extract