Rilcon42 Rilcon42 - 3 months ago 10
HTML Question

readHTMLTable returns dataframe values as integers

Can someone explain why the marked line below returns numbers instead of setting the column names the strings in that row? How would I get the correct column names if I include the commented out

colClasses
line when reading the table?

url<-'http://qpublic7.qpublic.net/ga_subdivison.php?county=ga_clarke&searchType=nbhd&numberValue=4025R&nameValue=&sectionValue=&townshipValue=&rangeValue=&startDate=01-1998&endDate=&startPrice=&endPrice=&startArea=&endArea=&startAcreage=&endAcreage=&saleQualification=All&saleVacant=All&propertyType=All&reasonType=All&start=0'
library(XML)
#colClasses = c("character","character","character","character","integer","integer","integer","character","character","integer","character","character"),
data<-readHTMLTable(url,header=F,as.data.frame=T)
View(data)
csv<-as.data.frame(data)
colnames(csv)<-csv[4,] #why does this line return numbers?
colnames(csv)<-gsub(" ","",colnames(csv))
View(head(csv))
csv<-csv[-c(1:4),]
#####
View(csv)

Answer

You were bit by stringsAsFactors. Plus, your as.data.frame call was not the right thing to do (look at the data structure with str vs View).

library(XML)

URL <- 'http://qpublic7.qpublic.net/ga_subdivison.php?county=ga_clarke&searchType=nbhd&numberValue=4025R&nameValue=&sectionValue=&townshipValue=&rangeValue=&startDate=01-1998&endDate=&startPrice=&endPrice=&startArea=&endArea=&startAcreage=&endAcreage=&saleQualification=All&saleVacant=All&propertyType=All&reasonType=All&start=0'

csv <- readHTMLTable(URL, header=FALSE, as.data.frame=TRUE, stringsAsFactors=FALSE)[[2]]

colnames(csv) <- csv[4,]
colnames(csv) <- gsub(" ", "", colnames(csv))

csv <- csv[-c(1:4),]

dplyr::glimpse(csv)

## Observations: 52
## Variables: 11
## $ \/ParcelNumber\/ (chr) "173C2 F023", "173C2 G009", "173C2 G007", "17...
## $ PropertyType       (chr) "R", "R", "R", "R", "R", "R", "R", "R", "R"...
## $ SaleDate           (chr) "12-2015", "08-2015", "08-2015", "07-2015",...
## $ SalePrice          (chr) "200,000", "265,000", "210,000", "188,000",...
## $ HeatedSqFt         (chr) "1,538", "1,756", "1,415", "1,125", "1,559"...
## $ Acres              (chr) "0.30", "0.37", "0.37", "0.38", "0.32", "0....
## $ SaleQual           (chr) "Q", "Q", "Q", "Q", "Q", "Q", "U", "Q", "Q"...
## $ Reason             (chr) "FM", "FM", "FM", "FM", "FM", "FM", "B", "F...
## $ YearBuilt          (chr) "1952", "1954", "1963", "1963", "1998", "19...
## $ LocationAddress    (chr) "155 HARDIN DR", "140 HARDIN DR", "150 HARD...
## $ Neighborhood       (chr) "4025R-RIVERDALE", "4025R-RIVERDALE", "4025...


# or use the more modern xml2 ---------------------------------------------

library(xml2)
library(rvest)

pg <- read_html(URL)

csv2 <- html_table(html_nodes(pg, "table")[[2]], fill=TRUE)

colnames(csv2) <- csv[4,]
colnames(csv2) <- gsub(" ", "", colnames(csv))

csv2 <- csv2[-c(1:4), -c(12:13)]

dplyr::glimpse(csv2)

## Observations: 52
## Variables: 11
## $ \/ParcelNumber\/ (chr) "173C2 F023 ", "173C2 G009 ", "173C2 G007 ", ...
## $ PropertyType       (chr) "R ", "R ", "R ", "R ", "R ", "R ", "R ", "...
## $ SaleDate           (chr) "12-2015 ", "08-2015 ", "08-2015 ", "07-201...
## $ SalePrice          (chr) "200,000 ", "265,000 ", "210,000 ", "188,00...
## $ HeatedSqFt         (chr) "1,538 ", "1,756 ", "1,415 ", "1,125 ", "1,...
## $ Acres              (chr) "0.30 ", "0.37 ", "0.37 ", "0.38 ", "0.32 "...
## $ SaleQual           (chr) "Q ", "Q ", "Q ", "Q ", "Q ", "Q ", "U ", "...
## $ Reason             (chr) "FM ", "FM ", "FM ", "FM ", "FM ", "FM ", "...
## $ YearBuilt          (chr) "1952 ", "1954 ", "1963 ", "1963 ", "1998 "...
## $ LocationAddress    (chr) "155 HARDIN DR ", "140 HARDIN DR ", "150 HA...
## $ Neighborhood       (chr) "4025R-RIVERDALE ", "4025R-RIVERDALE ", "40...