【问题标题】:rvest trouble: POST submissionrvest 麻烦:POST 提交
【发布时间】:2016-09-15 23:51:21
【问题描述】:

我正在尝试使用 rvest 从 USGS 邮寄表格下载数据。我做错了什么?

make_url = function(base_url, parameter_list)
  parameter_list %>%
  names %>%
  paste(parameter_list, sep = "=", collapse = "&") %>%
  paste(base_url, ., sep = "")

session = 
  list(sn = "01170000") %>%
  make_url("http://ida.water.usgs.gov/ida/available_records.cfm?", .) %>%
  html_session

test = 
  session %>%
  html_form %>%
  .[[1]] %>%
  set_values(fromdate = "1990-10-01") %>%
  set_values(todate = "2007-09-30") %>%
  set_values(rtype = "3") %>%
  submit_form(session, .)

【问题讨论】:

    标签: r rvest


    【解决方案1】:

    不需要rvest 或会话。下面的函数将接受电台和日期,并返回一个数据框,其中包含 USGS 每次下载时吐出的数据文件注释。

    它使用“下载压缩文件”选项来节省带宽并加快下载速度。它会生成临时文件来读取数据,但会自行清理。列被转换为正确的类型(不过,如果你愿意,你可以省略那部分代码)。如果不需要,也可以省略附加的注释(它似乎对我有用)。

    readr::read_lines() 用于提高速度,如果您不想依赖 readr 包,可以使用 readLines()

    data.frame 转换为tibble 版本主要是为了更好地打印,但它还有其他潜在优势,因此如果您不想依赖tibble 包,也可以省略它。

    有一个硬编码的 99 秒超时,但您可以根据需要对其进行参数化。

    library(httr)
    library(readr)
    library(tibble)
    
    #' Retrieve IDA Station Data
    #'
    #' @param site_no site id
    #' @param date_from records from date YYYY-mm-dd
    #' @param date_to records to date YYYY-mm-dd
    #' @return a parsed, type-converted data frame with a comments attribute. 
    #' @example
    #' deerfield <- get_ida("01170000", "1990-10-01", "2007-09-30")
    #'
    #' head(deerfield)
    #'
    #' cat(comment(deerfield))
    
    get_ida <- function(site_no, date_from, date_to) {
    
      date_from_time <- sprintf("%s 00:15:00.0", date_from)
      date_to_time <- sprintf("%s 23:45:00.0", date_to)
    
      ida_referer <- sprintf("http://ida.water.usgs.gov/ida/available_records.cfm?sn=%s", site_no)
    
      tf <- tempfile(".zip")
    
      res <- POST(url = "http://ida.water.usgs.gov/ida/available_records_process.cfm",
                  body = list(fromdate = date_from,
                              todate = date_to,
                              mindatetime = date_from_time,
                              maxdatetime = date_to_time,
                              site_no = site_no,
                              rtype = "2",
                              submit1 = "Retrieve+Data"),
                  add_headers(Origin="http://ida.water.usgs.gov",
                              Referer=ida_referer),
                  write_disk(tf),
                  timeout(99),
                  encode = "form")
    
      fils <- unzip(tf, exdir=tempdir())
      tmp <- read_lines(fils)
    
      unlink(tf)
      unlink(fils)
    
      comments <- grep("^#", tmp, value=TRUE)
      records <- grep("^#", tmp, value=TRUE, invert=TRUE)
      header <- records[1:2]
      records <- records[-(1:2)]
      cols <- strsplit(header[1], "[[:space:]]+")[[1]]
    
      comments <- paste0(comments, collapse="\n")
      records <- paste0(records, collapse="\n")
    
      df <- read_tsv(records, col_names=cols, "cccnnnnc")
      df$date_time <- as.POSIXct(df$date_time, format="%Y%m%d%H%M%S")
      df <- as_tibble(df)
    
      comment(df) <- comments
    
      df
    
    }
    

    证明它有效:

    deerfield <- get_ida("01170000", "1990-10-01", "2007-09-30")
    
    dplyr::glimpse(deerfield)
    ## Observations: 550,917
    ## Variables: 8
    ## $ site_no     <chr> "01170000", "01170000", "01170000", "01170000", "0117000...
    ## $ date_time   <time> 1990-10-01 00:15:00, 1990-10-01 00:30:00, 1990-10-01 00...
    ## $ tz_cd       <chr> "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", ...
    ## $ dd          <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,...
    ## $ accuracy_cd <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
    ## $ value       <dbl> 146, 139, 135, 143, 154, 166, 171, 175, 171, 166, 162, 1...
    ## $ prec        <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
    ## $ remark      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
    
    head(deerfield)
    ## # A tibble: 6 x 8
    ##    site_no           date_time tz_cd    dd accuracy_cd value  prec remark
    ##      <chr>              <time> <chr> <dbl>       <dbl> <dbl> <dbl>  <chr>
    ## 1 01170000 1990-10-01 00:15:00   EDT     7           1   146     3   <NA>
    ## 2 01170000 1990-10-01 00:30:00   EDT     7           1   139     3   <NA>
    ## 3 01170000 1990-10-01 00:45:00   EDT     7           1   135     3   <NA>
    ## 4 01170000 1990-10-01 01:00:00   EDT     7           1   143     3   <NA>
    ## 5 01170000 1990-10-01 01:15:00   EDT     7           1   154     3   <NA>
    ## 6 01170000 1990-10-01 01:30:00   EDT     7           1   166     3   <NA>
    
    cat(comment(deerfield))
    # retrieved: 2016-09-12 05:32:34 CST
    #
    # Data for the following station is contained in this file
    # ---------------------------------------------------------
    #  USGS 01170000 DEERFIELD RIVER NEAR WEST DEERFIELD, MA
    #
    # This data file was retrieved from the USGS
    # instantaneous data archive at
    # http://ida.water.usgs.gov
    #
    # ---------------------WARNING---------------------
    # The instantaneous data you have obtained from
    # this automated U.S. Geological Survey database
    # may or may not have been the basis for the published
    # daily mean discharges for this station. Although
    # automated filtering has been used to compare these
    # data to the published daily mean values and to remove
    # obviously bad data, there may still be significant
    # error in individual values. Users are strongly
    # encouraged to review all data carefully prior to use.
    # These data are released on the condition that neither
    # the USGS nor the United States Government may be held
    # liable for any damages resulting from its use.
    #
    # This file consists of tab-separated columns of the
    # following fields.
    #
    # column       column definition
    # -----------  -----------------------------------------
    # site_no      USGS site identification number
    # date_time     date and time in format (YYYYMMDDhhmmss)
    # tz_cd        time zone
    # dd           internal USGS sensor designation (''data descriptor'')
    # accuracy_cd  accuracy code
    #                   0 - A daily mean discharge calculated from the instantaneous
    #                       data on this day is 0.01 cubic feet per second
    #                       or less and the published daily mean is zero.
    #                   1 - A daily mean discharge calculated from the instantaneous
    #                       data on this day matches the published daily mean
    #                       within 1 percent.
    #                   2 - A daily mean discharge calculated from the instantaneous
    #                       data on this day matches the published daily mean
    #                       from greater than 1 to 5 percent.
    #                   3 - A daily mean discharge calculated from the instantaneous
    #                       values on this day matches the published daily mean
    #                       from greater than 5 to 10 percent.
    #                   9 - The instantaneous value is considered correct by the
    #                       collecting USGS Water Science Center. A published daily
    #                       mean value does not exist and/or no comparison was made.
    # value        discharge in cubic feet per second
    # precision    digits of precision in the discharge
    # remark       optional remark code
    #                 Remark  Explanation
    #                   <     Actual value is known to be less than reported value.
    #                   >     Actual value is known to be greater than reported value.
    #                   &     Value is affected by unspecified reasons.
    #                   A     Value is affected by ice at the measurement site.
    #                   B     Value is affected by backwater at the measurement site.
    #                   e     Value has been estimated by USGS personnel.
    #                   E     Value was computed from an estimated value.
    #                   F     Value was modified due to automated filtering.
    #                   K     Value is affected by instrument calibration drift.
    #                   R     Rating is undefined for this value.
    #
    #
    

    【讨论】:

    • 很好的答案!真的超越了。这似乎作为一个包的一部分可能会很好?我仍然想看看是否有办法让 rvest 工作。 hadleyverse 语法太好了。
    • 好吧,httr 技术上 是 hadleyverse 的一部分 ;-) 我想过同样的事情 w/r/t 潜在的 pkg,但 IDA 网站说所有数据最终将转移到 NWIS 站点,并且已经有一个 R pkg 可以访问该数据 - cran.r-project.org/web/packages/waterData/waterData.pdf - 但如果他们保持存档独立,那么我会考虑这样做。
    • 值得注意的是,这些数据已经与美国地质调查局的主要站点分离了至少两年。
    • 是的,但他们甚至在 IDA 主页上说正在迁移数据。
    • 好的,我想出了如何修复 rvest,关键是将 add_headers 参数作为 ... 添加到 submit_request 以便最终传递给 POST。我有两个问题:您如何确定这是正确的做法,这是 rvest 可以自动执行的操作吗?
    【解决方案2】:

    好的,这是让 rvest 发挥作用的一种方法:

    library(magrittr)
    
    make_url = function(base_url, parameter_list = list(), ...) {
      together_list = 
        parameter_list %>%
        c(list(...) )
    
      together_list %>%
        names %>%
        paste(together_list, sep = "=", collapse = "&") %>%
        paste(base_url, ., sep = "?")
    }
    
    download_ida = function(site_no, 
                            fromdate = "1990-10-01", 
                            todate = "2007-09-30", 
                            dir = ".",
                            filename = paste(site_no, "txt", sep = ".") ) {
    
      session = 
        "http://ida.water.usgs.gov/ida/available_records.cfm" %>%
        make_url(sn = "01170000") %>%
        html_session
    
      form = 
        session %>%
        html_form %>%
        .[[1]] %>%
        set_values(fromdate = fromdate,
                   todate = todate,
                   rtype = "2")
    
      tempfile = tempfile(".zip")
    
      submit_form(session, form, submit = NULL,
                  httr::write_disk(tempfile,
                                   overwrite = TRUE),
                  httr::add_headers(Referer = session$url) )
    
      filename = file.path(dir, filename)
    
      tempfile %>%
        unzip(exdir = dir) %>%
        file.rename(filename)
    
      filename
    }
    
    read_ida = function(filename) {
    
      col_names = 
        filename %>%
        readr::read_tsv(comment = "#", n_max = 1, col_names = FALSE)
    
      filename %>%
        readr::read_tsv(comment = "#", skip= 2, col_names = FALSE, na = "Ice",
                        col_types = cols(X2 = col_datetime(format = "%Y%m%d%H%M%S"))) %>%
        stats::setNames(col_names)
    }
    
    deerfield = 
      "01170000" %>%
      download_ida %>%
      read_ida
    

    但有一个警告:rvest 目前有一个开放的拉取请求,https://github.com/hadley/rvest/pull/161,这是让它工作所必需的。为此,有必要重新定义 submit_request 和 submit_form 以集成新的拉取请求:

    submit_request = function(form, submit = NULL) {
      is_submit <- function(x)
        if ( is.null(x$type) ) FALSE else
          tolower(x$type) %in% c("submit", "image", "button")
    
      submits <- Filter(is_submit, form$fields)
    
      if (length(submits) == 0) {
        stop("Could not find possible submission target.", call. = FALSE)
      }
      if (is.null(submit)) {
        submit <- names(submits)[[1]]
        message("Submitting with '", submit, "'")
      }
      if (!(submit %in% names(submits))) {
        stop("Unknown submission name '", submit, "'.\n", "Possible values: ", 
             paste0(names(submits), collapse = ", "), call. = FALSE)
      }
      other_submits <- setdiff(names(submits), submit)
      method <- form$method
      if (!(method %in% c("POST", "GET"))) {
        warning("Invalid method (", method, "), defaulting to GET", 
                call. = FALSE)
        method <- "GET"
      }
      url <- form$url
      fields <- form$fields
      fields <- Filter(function(x) length(x$value) > 0, fields)
      fields <- fields[setdiff(names(fields), other_submits)]
      values <- pluck(fields, "value")
      names(values) <- names(fields)
      list(method = method, encode = form$enctype, url = url, values = values)
    }
    
    submit_form = function(session, form, submit = NULL, ...) {
      request <- submit_request(form, submit)
      url <- xml2::url_absolute(form$url, session$url)
      if (request$method == "GET") {
        rvest:::request_GET(session, url = url, query = request$values, ...)
      } else if (request$method == "POST") {
        rvest:::request_POST(session, url = url, body = request$values, 
                             encode = request$encode, ...)
      } else {
        stop("Unknown method: ", request$method, call. = FALSE)
      }
    }
    

    希望拉取请求很快就会被合并。

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2019-06-14
      • 1970-01-01
      • 2011-08-20
      • 2019-01-31
      • 2010-11-11
      • 2011-06-12
      相关资源
      最近更新 更多