您可以开始迭代可以使用div.search-result .line 选择的行,然后:
- 使用
div:first-child h3 获取名称
- 使用
div:first-child p获取序数
- 通过迭代
div:nth-child(2) p 来获取位置,因为可以有多个位置(您的页面上有 5 个位置)并将它们存储在一个列表中
必须使用gsub("[\t\n]", "", x) 作为名称和序号来删除制表符和新行。对于地址,您可以获取文本并根据新行 \n 进行拆分,删除重复的新行并剥离第一行和最后一行以获得如下列表:
[1] "CABINET VÉTÉRINAIRE DV FEYS JEAN-MARC"
[2] "Cabinet Veterinaire"
[3] "ZA de Kercadiou"
[4] "XXXXX"
[5] "LANVOLLON"
[6] "Tél : 0X.XX.XX.XX.XX"
以下代码还将向量列表转换为包含该页面上所有数据的数据框:
library(rvest)
library(plyr)
url = "https://www.veterinaire.fr/annuaires/trouver-un-veterinaire-pour-soigner-mon-animal.html?tx_siteveterinaire_general%5B__referrer%5D%5B%40extension%5D=SiteVeterinaire&tx_siteveterinaire_general%5B__referrer%5D%5B%40vendor%5D=SiteVeterinaire&tx_siteveterinaire_general%5B__referrer%5D%5B%40controller%5D=FrontendUser&tx_siteveterinaire_general%5B__referrer%5D%5B%40action%5D=search&tx_siteveterinaire_general%5B__referrer%5D%5Barguments%5D=YToxOntzOjY6InNlYXJjaCI7YTo1OntzOjM6Im5vbSI7czowOiIiO3M6NjoicmVnaW9uIjtzOjA6IiI7czoxMToiZGVwYXJ0ZW1lbnQiO3M6MDoiIjtzOjU6InZpbGxlIjtzOjA6IiI7czoxMjoiaXRlbXNQZXJQYWdlIjtzOjI6IjEwIjt9fQ%3D%3D21a1899f9a133814dfc1eb4e01b3b47913bd9925&tx_siteveterinaire_general%5B__referrer%5D%5B%40request%5D=a%3A4%3A%7Bs%3A10%3A%22%40extension%22%3Bs%3A15%3A%22SiteVeterinaire%22%3Bs%3A11%3A%22%40controller%22%3Bs%3A12%3A%22FrontendUser%22%3Bs%3A7%3A%22%40action%22%3Bs%3A6%3A%22search%22%3Bs%3A7%3A%22%40vendor%22%3Bs%3A15%3A%22SiteVeterinaire%22%3B%7D7cd75ca141359a98763248c24da8103293a53d08&tx_siteveterinaire_general%5B__trustedProperties%5D=a%3A1%3A%7Bs%3A6%3A%22search%22%3Ba%3A5%3A%7Bs%3A3%3A%22nom%22%3Bi%3A1%3Bs%3A6%3A%22region%22%3Bi%3A1%3Bs%3A11%3A%22departement%22%3Bi%3A1%3Bs%3A5%3A%22ville%22%3Bi%3A1%3Bs%3A12%3A%22itemsPerPage%22%3Bi%3A1%3B%7D%7D86c9510d17c093c44d053714ab20567929a45f9d&tx_siteveterinaire_general%5Bsearch%5D%5Bnom%5D=&tx_siteveterinaire_general%5Bsearch%5D%5Bregion%5D=&tx_siteveterinaire_general%5Bsearch%5D%5Bdepartement%5D=&tx_siteveterinaire_general%5Bsearch%5D%5Bville%5D=&tx_siteveterinaire_general%5Bsearch%5D%5BitemsPerPage%5D=100&tx_siteveterinaire_general%5B%40widget_0%5D%5BcurrentPage%5D=127&cHash=8d8dc78e004b4b9d0ecfdf9b884f54ca"
rows <- read_html(url) %>%
html_nodes("div.search-result .line")
strip <- function (x) gsub("[\t\n]", "", x)
i <- 1
data = list()
for(r in rows){
addresses = list()
j <- 1
locations = r %>% html_nodes("div:nth-child(2) p")
for(loc in locations){
addresses[[j]] <- loc %>% html_text() %>%
gsub("[\t]", "", .) %>% #remove tabs
gsub('([\n])\\1+', '\\1', .) %>% #remove duplicate \n
gsub('^\n|\n$', '', .) %>% #remove 1st and last \n
strsplit(., split='\n', fixed=TRUE) #split by \n
j <- j + 1
}
data[[i]] <- c(
name = r %>% html_nodes("div:first-child h3") %>% html_text() %>% strip(.),
ordinal = r %>% html_nodes("div:first-child p") %>% html_text() %>% strip(.),
addresses = addresses
)
i <- i + 1
}
df = rbind.fill(lapply(data,function(y){as.data.frame(t(y),stringsAsFactors=FALSE)}))
#show data
print(df)
for(i in 1:3){
print(paste("name",df[i,"name"]))
print(paste("ordinal",df[i,"ordinal"]))
print(paste("addresses",df[i,"addresses"]))
print(paste("addresses1",df[i,"addresses1"]))
print(paste("addresses2",df[i,"addresses2"]))
print(paste("addresses3",df[i,"addresses3"]))
}