bramtayl bramtayl - 2 months ago 13
R Question

rvest trouble: POST submission

I'm trying to download data from a USGS POST form using rvest. What am I doing wrong?

# Build a URL by appending a query string to `base_url`.
#
# base_url:       character scalar that already ends where the query string
#                 should start (e.g. ends in "?").
# parameter_list: named list (or vector) of query parameters; each element is
#                 coerced with as.character().
#
# Returns `base_url` followed by "name=value" pairs joined with "&". Names and
# values are percent-encoded so that spaces, "&", "=" and similar characters
# cannot corrupt the query string. Strings that already contain a %XX escape
# are left untouched (URLencode's `repeated = FALSE` default), so pre-encoded
# input is not double-encoded. An empty list yields `base_url` unchanged.
make_url <- function(base_url, parameter_list) {
  encode <- function(x) {
    vapply(x, utils::URLencode, character(1), reserved = TRUE,
           USE.NAMES = FALSE)
  }
  keys <- encode(names(parameter_list))
  values <- encode(as.character(parameter_list))
  paste0(base_url, paste(keys, values, sep = "=", collapse = "&"))
}

# Open a browsing session on the "available records" page for station 01170000.
session <- html_session(
  make_url(
    "http://ida.water.usgs.gov/ida/available_records.cfm?",
    list(sn = "01170000")
  )
)

# Take the first form on the page, fill in the date range and retrieval type,
# and submit it within the existing session.
test <- session %>%
  html_form() %>%
  .[[1]] %>%
  set_values(
    fromdate = "1990-10-01",
    todate = "2007-09-30",
    rtype = "3"
  ) %>%
  submit_form(session, .)

Answer

No need for either rvest or a session. The following function takes a station ID and a date range and returns a data frame, with the header comments that the USGS includes in each downloaded file attached as its comment attribute.

It uses the "download compressed file" option to save bandwidth and speed up the download. It creates temporary files to read the data but cleans up after itself. Columns are converted to their proper types (you can omit that part of the code if you want to, though). You can also omit attaching the comment if you don't need it (it seemed to contain useful information to me).

readr::read_lines() is used for speed and you can use readLines() instead if you don't want to rely on the readr package.

The conversion to a tibble version of a data.frame is mainly for better printing but it has other potential advantages, so you can omit that as well if you don't want to rely on the tibble package.

There's a hard-coded 99 second timeout but you can parameterize it if desired.

library(httr)
library(readr)
library(tibble)

#' Retrieve IDA Station Data
#'
#' Posts a retrieval request to the USGS Instantaneous Data Archive (IDA),
#' downloads the zipped result to a temporary file, and parses it into a
#' data frame. Temporary files are removed when the function exits, whether
#' it succeeds or fails.
#'
#' @param site_no site id, as a character string (e.g. "01170000")
#' @param date_from records from date YYYY-mm-dd
#' @param date_to records to date YYYY-mm-dd
#' @return a parsed, type-converted data frame (tibble) with the file's `#`
#'   header lines stored, newline-joined, in its `comment` attribute.
#' @examples
#' \dontrun{
#' deerfield <- get_ida("01170000", "1990-10-01", "2007-09-30")
#'
#' head(deerfield)
#'
#' cat(comment(deerfield))
#' }
get_ida <- function(site_no, date_from, date_to) {

  date_from_time <- sprintf("%s 00:15:00.0", date_from)
  date_to_time <- sprintf("%s 23:45:00.0", date_to)

  ida_referer <- sprintf(
    "http://ida.water.usgs.gov/ida/available_records.cfm?sn=%s",
    site_no
  )

  # `fileext` (not the first, `pattern`, argument) is what gives the temp file
  # a ".zip" extension.
  tf <- tempfile(fileext = ".zip")
  # Register cleanup immediately so the download is removed even on error.
  on.exit(unlink(tf), add = TRUE)

  res <- POST(url = "http://ida.water.usgs.gov/ida/available_records_process.cfm",
              body = list(fromdate = date_from,
                          todate = date_to,
                          mindatetime = date_from_time,
                          maxdatetime = date_to_time,
                          site_no = site_no,
                          # presumably selects the "download compressed file"
                          # option of the form -- confirm against the site
                          rtype = "2",
                          submit1 = "Retrieve+Data"),
              add_headers(Origin = "http://ida.water.usgs.gov",
                          Referer = ida_referer),
              write_disk(tf),
              timeout(99),
              encode = "form")

  # Fail with the HTTP error instead of trying to unzip an error page.
  stop_for_status(res)

  fils <- unzip(tf, exdir = tempdir())
  on.exit(unlink(fils), add = TRUE)
  if (length(fils) == 0) {
    stop("IDA response for site ", site_no, " contained no extractable files",
         call. = FALSE)
  }

  tmp <- read_lines(fils)

  # Lines starting with "#" are the file's descriptive header; everything else
  # is the table: a column-name line, a second header line, then the records.
  is_comment <- grepl("^#", tmp)
  comments <- tmp[is_comment]
  records <- tmp[!is_comment]
  header <- records[1:2]
  records <- records[-(1:2)]
  cols <- strsplit(header[1], "[[:space:]]+")[[1]]

  comments <- paste0(comments, collapse = "\n")
  # The trailing newline plus I() make readr treat the string as literal data
  # rather than a file path, even when there is only a single record.
  records <- paste0(paste0(records, collapse = "\n"), "\n")

  df <- read_tsv(I(records), col_names = cols, col_types = "cccnnnnc")
  # NOTE(review): timestamps are parsed in the session's local time zone; the
  # tz_cd column is not applied -- confirm this is acceptable for your use.
  df$date_time <- as.POSIXct(df$date_time, format = "%Y%m%d%H%M%S")
  df <- as_tibble(df)

  comment(df) <- comments

  df

}

Proof it works:

# Download station 01170000 (Deerfield River) for 1990-10-01 through 2007-09-30.
deerfield <- get_ida("01170000", "1990-10-01", "2007-09-30")

# Show each column's type and its first few values.
dplyr::glimpse(deerfield)
## Observations: 550,917
## Variables: 8
## $ site_no     <chr> "01170000", "01170000", "01170000", "01170000", "0117000...
## $ date_time   <time> 1990-10-01 00:15:00, 1990-10-01 00:30:00, 1990-10-01 00...
## $ tz_cd       <chr> "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", ...
## $ dd          <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,...
## $ accuracy_cd <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ value       <dbl> 146, 139, 135, 143, 154, 166, 171, 175, 171, 166, 162, 1...
## $ prec        <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ remark      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...

# Print the first six rows of the parsed data.
head(deerfield)
## # A tibble: 6 x 8
##    site_no           date_time tz_cd    dd accuracy_cd value  prec remark
##      <chr>              <time> <chr> <dbl>       <dbl> <dbl> <dbl>  <chr>
## 1 01170000 1990-10-01 00:15:00   EDT     7           1   146     3   <NA>
## 2 01170000 1990-10-01 00:30:00   EDT     7           1   139     3   <NA>
## 3 01170000 1990-10-01 00:45:00   EDT     7           1   135     3   <NA>
## 4 01170000 1990-10-01 01:00:00   EDT     7           1   143     3   <NA>
## 5 01170000 1990-10-01 01:15:00   EDT     7           1   154     3   <NA>
## 6 01170000 1990-10-01 01:30:00   EDT     7           1   166     3   <NA>

# Print the USGS file header that get_ida() stored in the comment attribute.
cat(comment(deerfield))
# retrieved: 2016-09-12 05:32:34 CST
#
# Data for the following station is contained in this file
# ---------------------------------------------------------
#  USGS 01170000 DEERFIELD RIVER NEAR WEST DEERFIELD, MA
#
# This data file was retrieved from the USGS
# instantaneous data archive at
# http://ida.water.usgs.gov
#
# ---------------------WARNING---------------------
# The instantaneous data you have obtained from
# this automated U.S. Geological Survey database
# may or may not have been the basis for the published
# daily mean discharges for this station. Although
# automated filtering has been used to compare these
# data to the published daily mean values and to remove
# obviously bad data, there may still be significant
# error in individual values. Users are strongly
# encouraged to review all data carefully prior to use.
# These data are released on the condition that neither
# the USGS nor the United States Government may be held
# liable for any damages resulting from its use.
#
# This file consists of tab-separated columns of the
# following fields.
#
# column       column definition
# -----------  -----------------------------------------
# site_no      USGS site identification number
# date_time     date and time in format (YYYYMMDDhhmmss)
# tz_cd        time zone
# dd           internal USGS sensor designation (''data descriptor'')
# accuracy_cd  accuracy code
#                   0 - A daily mean discharge calculated from the instantaneous
#                       data on this day is 0.01 cubic feet per second
#                       or less and the published daily mean is zero.
#                   1 - A daily mean discharge calculated from the instantaneous
#                       data on this day matches the published daily mean
#                       within 1 percent.
#                   2 - A daily mean discharge calculated from the instantaneous
#                       data on this day matches the published daily mean
#                       from greater than 1 to 5 percent.
#                   3 - A daily mean discharge calculated from the instantaneous
#                       values on this day matches the published daily mean
#                       from greater than 5 to 10 percent.
#                   9 - The instantaneous value is considered correct by the
#                       collecting USGS Water Science Center. A published daily
#                       mean value does not exist and/or no comparison was made.
# value        discharge in cubic feet per second
# precision    digits of precision in the discharge
# remark       optional remark code
#                 Remark  Explanation
#                   <     Actual value is known to be less than reported value.
#                   >     Actual value is known to be greater than reported value.
#                   &     Value is affected by unspecified reasons.
#                   A     Value is affected by ice at the measurement site.
#                   B     Value is affected by backwater at the measurement site.
#                   e     Value has been estimated by USGS personnel.
#                   E     Value was computed from an estimated value.
#                   F     Value was modified due to automated filtering.
#                   K     Value is affected by instrument calibration drift.
#                   R     Rating is undefined for this value.
#
#