概述
将 httr::GET() 请求返回的内容从 JSON 转换为 R 对象之后，还需要经过几步数据处理，才能导出为 CSV 文件。
可重现的示例
# install necessary packages
install.packages( pkgs = c( "httr", "jsonlite", "magrittr" ))
# load necessary packages
library( httr )
library( jsonlite )
library( magrittr )
# URL of the API query (MonitoringSiteSpecies, GroupName=London, JSON output)
query <- "http://api.erg.kcl.ac.uk/AirQuality/Information/MonitoringSiteSpecies/GroupName=London/Json"
# Send the GET request and keep the whole response object
out <- httr::GET(url = query)
# Fail fast on an HTTP error status so that an error page is never
# handed to the JSON parser further down the script
httr::stop_for_status(out)
# Base-R route: out$content holds the response body as raw bytes,
# so decode it into a single character string
contents.out.base <- rawToChar(out$content)
# Peek at the first 30 characters of the JSON text
# (substr() treats any start below 1 as 1, so this is the same window)
substr(contents.out.base, start = 1, stop = 30)
# [1] "{\"Sites\":{\"Site\":[{\"@LocalAuth"
# Parse the JSON string into R objects.
# flatten = TRUE asks jsonlite to break nested data frames out into
# individual columns.
contents.out.base.df <- jsonlite::fromJSON(contents.out.base, flatten = TRUE)
# Inspect what the parser returned
class(contents.out.base.df) # [1] "list"
# The result is a nested list rather than a data frame, so walk down it
names(contents.out.base.df) # [1] "Sites"
names(contents.out.base.df$Sites) # [1] "Site"
names(contents.out.base.df$Sites$Site)
# [1] "@LocalAuthorityCode" "@LocalAuthorityName" "@SiteCode"
# [4] "@SiteName" "@SiteType" "@DateClosed"
# [7] "@DateOpened" "@Latitude" "@Longitude"
# [10] "@LatitudeWGS84" "@LongitudeWGS84" "@DataOwner"
# [13] "@DataManager" "@SiteLink" "Species"
# Every name carries an '@' prefix except 'Species'.
# Check the class of each element to see how 'Species' differs.
lapply(contents.out.base.df$Sites$Site, class)
# $`@LocalAuthorityCode`
# [1] "character"
#
# $`@LocalAuthorityName`
# [1] "character"
#
# $`@SiteCode`
# [1] "character"
#
# $`@SiteName`
# [1] "character"
#
# $`@SiteType`
# [1] "character"
#
# $`@DateClosed`
# [1] "character"
#
# $`@DateOpened`
# [1] "character"
#
# $`@Latitude`
# [1] "character"
#
# $`@Longitude`
# [1] "character"
#
# $`@LatitudeWGS84`
# [1] "character"
#
# $`@LongitudeWGS84`
# [1] "character"
#
# $`@DataOwner`
# [1] "character"
#
# $`@DataManager`
# [1] "character"
#
# $`@SiteLink`
# [1] "character"
#
# $Species
# [1] "list"
# Copy the Site table into its own data frame, keeping every column
# except the list-column 'Species'
website.df <- local({
  site.table <- contents.out.base.df$Sites$Site
  site.table[, which(names(site.table) != "Species")]
})
# Dimensions of the result
dim(website.df) # [1] 212 14
# Show the first six rows
head(website.df)
#' @LocalAuthorityCode @LocalAuthorityName @SiteCode
#' 1 1 Barking and Dagenham BG3
#' 2 1 Barking and Dagenham BG1
#' 3 1 Barking and Dagenham BG2
#' 4 2 Barnet BN2
#' 5 2 Barnet BN3
#' 6 2 Barnet BN1
#' @SiteName @SiteType
#' 1 Barking and Dagenham - North Street Kerbside
#' 2 Barking and Dagenham - Rush Green Suburban
#' 3 Barking and Dagenham - Scrattons Farm Suburban
#' 4 Barnet - Finchley Urban Background
#' 5 Barnet - Strawberry Vale Urban Background
#' 6 Barnet - Tally Ho Corner Kerbside
#' @DateClosed @DateOpened @Latitude
#' 1 2011-05-25 00:00:00 2007-03-16 00:00:00 51.540444
#' 2 1999-11-02 00:00:00 51.563752
#' 3 1999-10-17 00:00:00 51.529389
#' 4 2012-04-20 00:00:00 2000-08-09 13:00:00 51.591901
#' 5 2002-05-15 00:00:00 2000-08-14 14:00:00 51.6008848453589
#' 6 2012-04-20 00:00:00 1998-12-20 12:00:00 51.614675
#' @Longitude @LatitudeWGS84 @LongitudeWGS84
#' 1 0.074418 6717454.5833 8284.17386585
#' 2 0.177891 6721627.34498 19802.7355367
#' 3 0.132857 6715476.18683 14789.5735883
#' 4 -0.205992 6726669.62886 -22930.9245475
#' 5 -0.172297542087178 6728279.54795 -19180.0746501
#' 6 -0.176607 6730751.38494 -19659.8013105
#' @DataOwner @DataManager
#' 1 Barking and Dagenham King's College London
#' 2 Barking and Dagenham King's College London
#' 3 Barking and Dagenham King's College London
#' 4 Barnet King's College London
#' 5 Barnet King's College London
#' 6 Barnet King's College London
#' @SiteLink
#' 1 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BG3
#' 2 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BG1
#' 3 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BG2
#' 4 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BN2
#' 5 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BN3
#' 6 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BN1
TL;DR：最终的数据框将超过 212 行。
每个监测站点可以监测多种污染物（即数据中的 Species，例如 NO2、SO2、PM10）。
这种一对多（one-to-many）关系使 website.df 的最终版本变成“长”（long）格式：如果某个站点监测的污染物不止一种，该站点及其所属地方当局的信息就会在多行中重复出现。
为了将 contents.out.base.df$Sites$Site$Species 中的每个对象与 website.df 中对应的行合并，我使用了一个 counter 对象。在调用 lapply() 之前，counter 被设为 0。
使用 <<-（超级赋值运算符）可以在 lapply() 每次处理 contents.out.base.df$Sites$Site$Species 中的下一个对象时递增 counter，并据此取出 website.df 中对应的行。Stack Overflow 帖子《Using a counter inside an apply structured loop in R》对学会正确使用这种写法非常有帮助。
注意：使用 cbind() 合并 contents.out.base.df$Sites$Site$Species 中的对象会产生多条警告（可用 warnings() 查看）。Stack Overflow 帖子《cbind warnings : row names were found from a short variable and have been discarded》
说明了原因：cbind() 会导致行名重复；为了避免出现重复的行名，它直接丢弃了这些行名。
# Index of the website.df row that belongs to the Species entry being
# processed; it starts at zero and is advanced inside the function below
counter <- 0
# Replace every Species entry with that entry column-bound onto its
# matching row of website.df
contents.out.base.df$Sites$Site$Species <- lapply(
  contents.out.base.df$Sites$Site$Species,
  function(i) {
    # advance the counter defined outside this function
    counter <<- counter + 1
    # put the counter-th website.df row in front of the Species entry i
    cbind(website.df[counter, ], i, stringsAsFactors = FALSE)
  }
)
# There were 50 or more warnings (use warnings() to see the first 50)
warnings()
# Warning messages:
# 1: In data.frame(..., check.names = FALSE) :
# row names were found from a short variable and have been discarded
将折叠列表导出为 CSV
将 contents.out.base.df$Sites$Site$Species 中的对象折叠成一个数据框之后，我整理了 website.df 的行名和列名。最后，就可以用 write.csv() 函数把 website.df 导出到您的工作目录。
# Stack all objects held in the Species list into a single table with
# rbind, then wrap the result in data.frame()
website.df <- data.frame(
  do.call(rbind, contents.out.base.df$Sites$Site$Species),
  stringsAsFactors = FALSE
)
# Dimensions after stacking
dim(website.df) # [1] 524 18
# Renumber the rows "1".."n".
# seq_len() yields an empty sequence for a zero-row data frame,
# whereas 1:nrow() would produce c(1, 0) and make the assignment fail.
rownames(website.df) <- as.character(seq_len(nrow(website.df)))
# Strip the leading "X." prefix from the column names.
# The pattern is anchored to the start and the dot is escaped, so only a
# literal "X." prefix is removed; an "X" followed by any other character
# elsewhere in a name is left untouched.
colnames(website.df) <- sub(
  pattern = "^X\\.",
  replacement = "",
  x = colnames(website.df)
)
# Show the first six rows
head(website.df)
# LocalAuthorityCode LocalAuthorityName SiteCode
# 1 1 Barking and Dagenham BG3
# 2 1 Barking and Dagenham BG1
# 3 1 Barking and Dagenham BG1
# 4 1 Barking and Dagenham BG2
# 5 1 Barking and Dagenham BG2
# 6 2 Barnet BN2
# SiteName SiteType
# 1 Barking and Dagenham - North Street Kerbside
# 2 Barking and Dagenham - Rush Green Suburban
# 3 Barking and Dagenham - Rush Green Suburban
# 4 Barking and Dagenham - Scrattons Farm Suburban
# 5 Barking and Dagenham - Scrattons Farm Suburban
# 6 Barnet - Finchley Urban Background
# DateClosed DateOpened Latitude Longitude
# 1 2011-05-25 00:00:00 2007-03-16 00:00:00 51.540444 0.074418
# 2 1999-11-02 00:00:00 51.563752 0.177891
# 3 1999-11-02 00:00:00 51.563752 0.177891
# 4 1999-10-17 00:00:00 51.529389 0.132857
# 5 1999-10-17 00:00:00 51.529389 0.132857
# 6 2012-04-20 00:00:00 2000-08-09 13:00:00 51.591901 -0.205992
# LatitudeWGS84 LongitudeWGS84 DataOwner
# 1 6717454.5833 8284.17386585 Barking and Dagenham
# 2 6721627.34498 19802.7355367 Barking and Dagenham
# 3 6721627.34498 19802.7355367 Barking and Dagenham
# 4 6715476.18683 14789.5735883 Barking and Dagenham
# 5 6715476.18683 14789.5735883 Barking and Dagenham
# 6 6726669.62886 -22930.9245475 Barnet
# DataManager
# 1 King's College London
# 2 King's College London
# 3 King's College London
# 4 King's College London
# 5 King's College London
# 6 King's College London
# SiteLink
# 1 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BG3
# 2 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BG1
# 3 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BG1
# 4 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BG2
# 5 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BG2
# 6 http://www.londonair.org.uk/london/asp/publicdetails.asp?site=BN2
# SpeciesCode SpeciesDescription DateMeasurementStarted
# 1 NO2 Nitrogen Dioxide 2008-01-01 00:00:00
# 2 NO2 Nitrogen Dioxide 2008-01-01 00:00:00
# 3 SO2 Sulphur Dioxide 1999-10-23 00:00:00
# 4 NO2 Nitrogen Dioxide 2007-11-21 00:00:00
# 5 PM10 PM10 Particulate 1999-10-17 00:00:00
# 6 NO2 Nitrogen Dioxide 2008-01-01 00:00:00
# DateMeasurementFinished
# 1 2011-05-25 00:00:00
# 2
# 3
# 4
# 5
# 6 2012-04-20 00:00:00
# Write the data frame to "web_scrape.csv" (a path relative to the
# working directory), leaving the row names out of the file
write.csv(website.df, file = "web_scrape.csv", row.names = FALSE)
# end of script #
会话信息
使用sessionInfo()。
R version 3.4.3 (2017-11-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.2
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] magrittr_1.5 jsonlite_1.5 httr_1.3.1
loaded via a namespace (and not attached):
[1] compiler_3.4.3 R6_2.2.2 rgdal_1.2-16 tools_3.4.3
[5] sp_1.2-7 curl_3.1 yaml_2.1.16 grid_3.4.3
[9] lattice_0.20-35