library(RCurl)
library(reshape)
library(htmltab)
library(ggplot2)
library(stringr)
library(scales)
library(lubridate)
#set these variables to TRUE/FALSE to toggle trend lines
daily.average = FALSE #bumpy daily average
smooth.average = FALSE #smoothed average using local regression
#get the tables from the url
theurl <- getURL("https://en.wikipedia.org/wiki/Nationwide_opinion_polling_for_the_United_States_presidential_election,_2016", ssl.verifyPeer=FALSE)
table1 <- htmltab(theurl, which=3)
table2 <- htmltab(theurl, which=4)
#"since convention nominations" table
table1 <- table1[, c(1, 2, 6, 3, 4)]
names(table1) <- c("Source", "Date", "Size", "DP", "RP")
#"Polls conducted in 2016" table
table2 <- table2[, c(1, 2, 8, 3:6)]
names(table2) <- c("Source", "Date", "Size", "DC", "DP", "RC", "RP")
table2 <- table2[which(table2$DC=="Hillary Clinton" & table2$RC=="Donald Trump"), c(1:3, 5, 7)]
#merge tables
df <- rbind(table1, table2)
names(df)[4:5] <- c("Clinton", "Trump")
#format numerical and date data
for (i in 4:5) {
df[[i]] <- as.numeric(sub("%", "", df[[i]]))/100
}
df$Size <- as.numeric(sub(",", "", df$Size))
df$Date <- sub("[0-9]+\\s*(–|-)\\s*([0-9]+)", "\\2", df$Date)
df$Date <- sub(".*(–|-)", "", df$Date)
df$Date <- trimws(df$Date)
df$Date <- as.Date(df$Date, format="%B %d, %Y")
#only keep polls as far as 3 months ago or after conventions
df <- df[which(df$Date >= min(max(df$Date)-months(3), as.Date("2016-07-28"))),]
#reshape data to have candidate and support as variable
mdata <- melt(df, id=c("Date", "Source", "Size"))
names(mdata)[4:5] <- c("Candidate", "Support")
colors <- c("#3333FF", "#FF3333")
labels <- c("Clinton", "Trump")
results <- mdata
#make plot
d <- ggplot(results, aes(x=Date, y=Support, colour=Candidate))
d <- d + geom_point(aes(size=Size), alpha=0.5)
#optional smooth average computation and display
if(smooth.average) {
d <- d + geom_smooth(aes(weight=Size), span=0.6, size=0.8, se=TRUE)
}
#optional daily average computation and display
if(daily.average) {
average <- function(dataframe, date, candidate) {
return(with(dataframe[which(dataframe$Date==as.Date(date) & dataframe$Candidate==candidate),], weighted.mean(Support, Size)))
}
dates <- unique(df$Date)
avg.results <- data.frame(Date=rep(dates, 2),
Candidate=c(
rep("Clinton", length(dates)),
rep("Trump", length(dates))
),
Support=c(
as.double(lapply(dates, function(x) average(results, x, "Clinton"))),
as.double(lapply(dates, function(x) average(results, x, "Trump")))
))
d <- d + geom_line(data=avg.results, size=0.8)
}
d <- d + scale_colour_manual(values = colors)
d <- d + labs(title="Nationwide opinion polling for the 2016 U.S. presidential election")
d <- d + scale_size_area(max_size=15,
breaks=c(1000, 2000, 4000, 8000, 16000),
labels=function(x) comma_format()(x),
name="Sample Size")
d <- d + scale_y_continuous(breaks=seq(0,1,0.05),
minor_breaks=seq(0,1,0.01),
labels=percent,
limits=c(0.3, 0.55))
d <- d + scale_x_date(labels=date_format("%b %d"),
breaks=date_breaks("weeks"),
minor_breaks=date_breaks("days"))
d <- d + theme(panel.grid.minor=element_line(size=0.2),
panel.grid.major=element_line(size=0.6))
#save plot as "ct.svg"
svg(filename="ct.svg",
width=9,
height=4,
pointsize=12,
bg="transparent")
d
dev.off()