rm(list=ls())
#Load the complete HTML file into memory
html <- readLines(url("http://en.wikipedia.org/wiki/Opinion_polling_for_the_New_Zealand_general_election,_2008"),encoding="UTF-8")
closeAllConnections()
#The third table is the opinion poll data
tbl <- html[(grep("<table.*",html)[3]):(grep("</table.*",html)[3])]
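#Note: this assumes the poll results are the third table in the page's HTML and that the
#opening and closing table tags appear on separate lines; if the page layout changes, the index needs updating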
#Now split it into the rows, based on the <tr> tag
tbl.rows <- split(tbl,cumsum(tbl=="<tr>"))
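#The grouping trick: cumsum() over the logical vector increments at every "<tr>" line,
#e.g. cumsum(c("<tr>","a","<tr>","b")=="<tr>") gives 1 1 2 2, so each table row
#(together with the lines that follow it up to the next <tr>) ends up in its own list element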
#Now extract the data
survey.dat <- lapply(tbl.rows,function(x) {
  #Start by only considering where we have <td> tags
  td.tags <- x[grep("<td",x)]
  #Polling data appears in columns 3-10
  dat <- td.tags[3:10]
  #Now strip the data and convert to numeric format
  dat <- gsub("<td>|</td>","",dat)
  dat <- gsub("%","",dat)
  dat <- gsub("-","0",dat)
  dat <- as.numeric(dat)
  #Getting the date strings is a little harder. The approach we will take is to take advantage
  #of the title="date" hyperlinks to generate a set of dates
  date.str <- td.tags[2]   #Dates are in the second column
  date.str <- gsub("<sup.*</sup>","",date.str)   #Throw out anything between superscript tags, as it's a reference to the source
  titles <- gregexpr("(?U)title=\".*\"",date.str,perl=TRUE)[[1]]   #Find the location of the title tags
  #Now, extract the actual date strings
  date.strings <- character(length(titles))
  for(i in 1:length(titles)) {
    date.strings[i] <- substr(date.str,titles[i]+7,titles[i]+attr(titles,"match.length")[i]-2)
  }
  #The last title link is the year; append it to each day-month string before parsing
  yr <- rev(date.strings)[1]
  dates <- rep(as.POSIXct(Sys.time()),length(date.strings)-1)
  for(i in 1:(length(date.strings)-1)) {
    dates[i] <- as.POSIXct(strptime(paste(date.strings[i],yr),"%B %d %Y"))
  }
  survey.time <- mean(dates)
  #Get the name of the survey company too
  survey.comp <- td.tags[1]
  survey.comp <- gsub("<sup.*</sup>","",survey.comp)
  survey.comp <- gsub("<td>|</td>","",survey.comp)
  survey.comp <- gsub("\u2013","-",survey.comp,fixed=TRUE)   #Replace en-dashes with plain hyphens
  survey.comp <- gsub("(?U)<.*>","",survey.comp,perl=TRUE)
  #And now return results
  return(data.frame(Company=survey.comp,Date=survey.time,t(dat)))
})
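#Optional sanity check on a single parsed row (the index 2 is purely illustrative;
#rows without poll data, such as the table header, come through as NAs):
#str(survey.dat[[2]])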
#Combine results
surveys <- do.call(rbind,survey.dat)
colnames(surveys) <- c("Company","Date","Labour","National","NZ First","Maori Party","Greens","ACT","United Future","Progressive")
#Restrict the plot (manually) to parties that have polled over 5%
parties <- c("Greens","Labour","National","NZ First")
cols <- c("darkgreen","red","blue","black")
polls <- surveys[,c("Company","Date",parties)]
polls <- subset(polls,!is.na(Date))   #Drop rows where no date could be parsed
polls <- polls[order(polls$Date),]
polls$Date <- as.double(polls$Date)
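#Converting the POSIXct dates to plain numerics (seconds since 1970-01-01) lets
#matplot(), loess() and polygon() below treat the survey date as a continuous x variable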
ticks <- ISOdate(c(2005,rep(2006,3),rep(2007,3),rep(2008,3)),c(9,rep(c(1,5,9),3)),1)
xlims <- range(as.double(c(ticks,ISOdate(2009,4,1))))
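#Padding the x range out to April 2009 leaves space on the right-hand side of the plot
#for the end-of-series labels added further down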
png("NZ_opinion_polls_2005-2008 -parties.png",width=778,height=487,pointsize=16)
par(mar=c(3,4,1,1))
matplot(polls$Date,polls[,parties],pch=NA,xlim=xlims,ylab="Party support (%)",xlab="",col=cols,xaxt="n",ylim=c(0,60))
abline(h=seq(0,95,by=5),col="lightgrey",lty=3)
abline(v=as.double(ticks),col="lightgrey",lty=3)
#Now add loess smoothers
smoothed <- list()
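#Each party gets its own loess smoother; the shaded band drawn below is fit ± 1.96*se,
#an approximate 95% pointwise confidence interval for the trend, and span=0.25 keeps
#the smoother fairly local to the individual polls (larger values give a flatter trend)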
for(i in 1:length(parties)) {
  smoother <- loess(polls[,parties[i]] ~ polls[,"Date"],span=0.25)
  smoothed[[i]] <- predict(smoother,se=TRUE)
  polygon(c(polls[,"Date"],rev(polls[,"Date"])),
          c(smoothed[[i]]$fit+smoothed[[i]]$se.fit*1.96,rev(smoothed[[i]]$fit-smoothed[[i]]$se.fit*1.96)),
          col=rgb(0.5,0.5,0.5,0.5),border=NA)
}
names(smoothed) <- parties
for(i in 1:length(parties)) {
  lines(polls[,"Date"],smoothed[[i]]$fit,col=cols[i],lwd=2)
}
matpoints(polls$Date,polls[,parties],pch=20,col=cols)
legend("topleft",legend=parties,col=cols,pch=20,bg="white",lwd=2)
axis(1,at=as.double(ticks),labels=format(ticks,format="%b\n%Y"),cex.axis=0.8)
axis(4,at=axTicks(4),labels=rep("",length(axTicks(4))))
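#Mirror the tick marks on the right-hand axis, with blank labels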
#Add best estimates
for(i in 1:length(smoothed)) {
  lbl <- sprintf("%4.1f%% ± %2.1f",round(rev(smoothed[[i]]$fit)[1],1),round(1.96*rev(smoothed[[i]]$se.fit)[1],1))
  text(rev(polls$Date)[1],rev(smoothed[[i]]$fit)[1],labels=lbl,pos=4,col=cols[i])
}
dev.off()
#As a cross-check, print any rows that contain NAs
checks <- subset(surveys,apply(surveys,1,function(x) any(is.na(x))))
print(checks)