【问题标题】:How do I collect historical weather data from Wunderground using R function?如何使用 R 函数从 Wunderground 收集历史天气数据?
【发布时间】:2016-10-09 20:57:59
【问题描述】:

谁能帮我修复这个代码? 我去年使用的脚本没有问题,但现在与 url 的连接存在问题。

我该如何解决?

我想要的是从气象站“EKAH”(Tirstrup,丹麦奥胡斯机场)收集和整理 2015 年 12 月 1 日至 2016 年 4 月 15 日的数据。

    ############## 1) Run function --------------------

    wunder_station_daily <- function(station, date)
    {
      base_url <- 'https://www.wunderground.com/history/airport'

      # Example website: https://www.wunderground.com/history/airport/EKAH/2016/06/09/DailyHistory.html?&MR=1

      # parse date
      m <- as.integer(format(date, '%m'))
      d <- as.integer(format(date, '%d'))
      y <- format(date, '%Y')

      # compose final url
      final_url <- paste(base_url,
                         '/', station,
                         '/', y,
                         '/', m, 
                         '/', d,
                         '/DailyHistory.html?&MR=1', sep='')

      # reading in as raw lines from the web server
      # contains <br> tags on every other line

      #   u <- url(final_url)
      #   the_data <- readLines(u)
      #   close(u)

      the_data <- getURL(final_url, ssl.verifypeer=0L, followlocation=1L)

      # only keep records with more than 5 rows of data
      if(length(the_data) > 5 )
      {
        # remove the first and last lines
        the_data <- the_data[-c(1, length(the_data))]

        # remove odd numbers starting from 3 --> end
        the_data <- the_data[-seq(3, length(the_data), by=2)]

        # extract header and cleanup
        the_header <- the_data[1]
        the_header <- make.names(strsplit(the_header, ',')[[1]])

        # convert to CSV, without header
        tC <- textConnection(paste(the_data, collapse='\n'))
        the_data <- read.csv(tC, as.is=TRUE, row.names=NULL, header=FALSE, skip=1)
        close(tC)

        # remove the last column, created by trailing comma
        the_data <- the_data[, -ncol(the_data)]

        # assign column names
        names(the_data) <- the_header

        # convert Time column into properly encoded date time
        the_data$Time <- as.POSIXct(strptime(the_data$Time, format='%Y-%m-%d %H:%M:%S'))

        # remove UTC and software type columns
        the_data$DateUTC.br. <- NULL
        the_data$SoftwareType <- NULL

        # sort and fix rownames
        the_data <- the_data[order(the_data$Time), ]
        row.names(the_data) <- 1:nrow(the_data)

        # done
        return(the_data)
      }
    }


    ############## 2) Get data for a range of dates ------------------------------


    date.range <- seq.Date(from=as.Date('2015-12-01'), to=as.Date('2015-12-04'), by='1 day')
    station <- 'EKAH'


    # pre-allocate list
    l <- vector(mode='list', length=length(date.range))

    # loop over dates, and fetch data
    for(i in seq_along(date.range))
    {
      print(paste0("Fetching data: ", date.range[i]))
      l[[i]] <- wunder_station_daily('EKAH', date.range[i])
    }

    # stack elements of list into DF, filling missing columns with NA
    d <- ldply(l)

【问题讨论】:

    标签: r url weather collect geturl


    【解决方案1】:

    他们稍微更改了 URL,虽然我认为我可以为新的 URL 替换掉错误的 URL,但这是大部分代码的现代化版本(我做了 99% 的清理工作):

    #' @param station station name
    #' @param wx_date Date object or character string 
    #' @param fmt if wx_date is not a Date object and the character string
    #'        is not in "%Y-%m-%d" format, then specify the format here
    #' @return data.frame of redings
    get_wx <- function(station="EKAH", wx_date=Sys.Date(), fmt="%Y-%m-%d") {
    
      require(httr)
      require(readr)
    
      if (inherits(wx_date, "character")) {
        wx_date <- as.Date(wx_date, fmt)
      }
    
      wx_base_url <- "https://www.wunderground.com/history/airport/%s/%s/DailyHistory.html"
      wx_url <- sprintf(wx_base_url, station, format(wx_date, "%Y/%m/%d"))
    
      res <- httr::GET(wx_url, query=list(MR=1, format=1))
      dat <- httr::content(res, as="text")
    
      dat <- gsub("<br />", "", dat)
      dat <- read.table(text=dat, sep=",", header=TRUE,
                        na.strings=c("-", "N/A", "NA"), stringsAsFactors=FALSE)
    
      # saner column names
    
      cols <- colnames(dat)
    
      # via http://stackoverflow.com/a/22528880/1457051
      cols <- gsub("([a-z])([A-Z])", "\\1_\\L\\2", cols, perl=TRUE)
      cols <- sub("^(_[a-z])", "\\L\\1", cols, perl=TRUE)
      cols <- tolower(gsub("\\.", "_", cols))
    
      readr::type_convert(setNames(dat, cols)) # more robust than type.convert()
    
    }
    
    tdy <- get_wx()
    
    str(tdy)
    ## 'data.frame': 36 obs. of  14 variables:
    ##  $ time_cest            : chr  "12:00 AM" "12:20 AM" "12:50 AM" "1:00 AM" ...
    ##  $ temperature_f        : num  51 50 48.2 47 46.4 44.6 44 44.6 44.6 44 ...
    ##  $ dew_point_f          : num  41 41 39.2 39 39.2 39.2 38 39.2 39.2 38 ...
    ##  $ humidity             : int  60 71 71 67 76 81 71 81 81 73 ...
    ##  $ sea_level_pressure_in: num  30.1 30.1 30.1 30.1 30.1 ...
    ##  $ visibility_mph       : num  28 6.2 6.2 28 6.2 6.2 7 6.2 6.2 28 ...
    ##  $ wind_direction       : chr  "WNW" "West" "West" "West" ...
    ##  $ wind_speed_mph       : chr  "2.3" "2.3" "2.3" "2.3" ...
    ##  $ gust_speed_mph       : logi  NA NA NA NA NA NA ...
    ##  $ precipitation_in     : logi  NA NA NA NA NA NA ...
    ##  $ events               : logi  NA NA NA NA NA NA ...
    ##  $ conditions           : chr  NA "Unknown" "Unknown" NA ...
    ##  $ wind_dir_degrees     : int  300 270 270 270 270 270 270 280 280 270 ...
    ##  $ date_utc             : POSIXct, format: "2016-06-08 22:00:00" "2016-06-08 22:20:00" ...
    
    a_yr_ago <- get_wx(wx_date="2015-06-09")
    

    如果需要,您可以将"" 添加到na.strings 向量中。

    而且,这是将一系列日期的读数输入data.frame 的另一种方法:

    library(purrr)
    
    rng <- map_df(seq(as.Date("2015-12-01"), as.Date("2015-12-04"), "1 day"),
                  function(x) { get_wx(wx_date=x) })
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 2017-02-15
      • 2018-01-02
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2013-11-27
      • 1970-01-01
      相关资源
      最近更新 更多