# Place this .R file together with the two data files in a directory. # Change the current directory in R to that directory (e.g. focus on R Console File->ChangeDir) # Open the two relevant results files as R data frames #http://www.athenryac.com/sites/www.athenryac.com/files/FullResult.txt res08=read.delim("FullResult.txt",sep="\t",header=TRUE) #http://www.athenryac.com/sites/www.athenryac.com/files/10foa10km_provisional.txt res11=read.delim("10foa10km_provisional.txt",sep="\t",skip=1,header=TRUE) # Already we can see a difference, the second file has a title line # above the header describing the columns, which I skipped with the "skip=1" argument # We can inspect the contents of these files by executing "edit(res11)" for example, # or executing "edit(head(res11))" to just see the first few lines. # The first dataset doesn't have a sex column, but we can extract that information # from the txtCat2 column and add it to the data frame ourselves res08$Sex=substr(res08$txtCat2,1,1) # Now we can convert the time strings in both data frames into R time objects res08$FinishTime=as.numeric(as.difftime(as.character(res08$txtTime),format="%H:%M:%S")) res11$FinishTime=as.numeric(as.difftime(as.character(res11$Chip.Time),format="%H:%M:%S")) # We can find out how many people ran, and had their time recorded in each year length(res08$FinishTime[!is.na(res08$FinishTime)]) length(res11$FinishTime[!is.na(res11$FinishTime)]) # Now we look at MY time. These are my bib numbers for the two years bib08=737 bib11=650 # We can filter out my data from the two data frames runner08=res08[res08$txtEntrantNo==bib08,] runner11=res11[res11$Bib==bib11,] # We can generate summary tables for the finish times and compare with my times summary(res08$FinishTime) runner08$FinishTime summary(res11$FinishTime) runner11$FinishTime png("FinishDist.png",width=700,height=400) # We can look at the distribution of finishing times plot(density(res08$FinishTime,na.rm=TRUE),lwd=3,xlab="Finish Time (mins)", main="Fields of Athenry 10k Results") points(density(res11$FinishTime,na.rm=TRUE),type="l",lwd=3,col="red") legend("topright",c("Dec 2008","Jan 2011"),col=c("black","red"),lwd=2.5) # And indicate my finish times on the plot abline(v=runner08$FinishTime, lwd="2.5", lty="dashed") abline(v=runner11$FinishTime,lwd="2.5", lty="dotted",col="red") dev.off() # We can ask ourselves if the finishing times in 2011 were generally faster than those in 2008 # by performing a non-parametric Mann-Witney test for differences in median finishing time. # This test doesn't assume any particular shape for distributions (they are obviously non-normal). # Another alternative is to carry out a t-test for differences in the mean finishing times. I don't like this # as it assumes normal distributions. Interestingly the two computed p-values are very different (though # neither suggest a significant difference between years) wilcox.test(res08$FinishTime,res11$FinishTime,conf.int=TRUE) t.test(res08$FinishTime,res11$FinishTime,conf.int=TRUE) # There are many other comparisons we can make, for instance we can compare mens times with womens times png("SexCompare.png",width=700,height=400) plot(density(res08$FinishTime[res08$Sex=="M"],na.rm=TRUE),lwd=3,xlab="Finish Time (mins)", main="Fields of Athenry 10k Results") points(density(res08$FinishTime[res08$Sex!="M"],na.rm=TRUE),lwd=3,type="l",lty=2) points(density(res11$FinishTime[res11$Sex=="M"],na.rm=TRUE),lwd=3,type="l",col="red",) points(density(res11$FinishTime[res11$Sex!="M"],na.rm=TRUE),lwd=3,type="l",col="red",lty=2) legend("topright",c("Men Dec 2008","Women Dec 2008","Men Jan 2011","Women Jan 2011"),col=c("black","black","red","red"),lwd=2.5,lty=c(1,2,1,2)) dev.off() # And look at summaries for these summary(res08$FinishTime[res08$Sex=="M"]) summary(res08$FinishTime[res08$Sex!="M"]) summary(res11$FinishTime[res11$Sex=="M"]) summary(res11$FinishTime[res11$Sex!="M"]) # And test for differences between men and women wilcox.test(res08$FinishTime[res08$Sex=="M"],res08$FinishTime[res08$Sex!="M"],conf.int=TRUE) wilcox.test(res11$FinishTime[res11$Sex=="M"],res11$FinishTime[res08$Sex!="M"],conf.int=TRUE) # Or compare men 2008 with men 2011 and similar for women wilcox.test(res08$FinishTime[res08$Sex=="M"],res11$FinishTime[res11$Sex=="M"],conf.int=TRUE) wilcox.test(res08$FinishTime[res08$Sex!="M"],res11$FinishTime[res08$Sex!="M"],conf.int=TRUE) # We can have a look at all the categories recorded for 2008 cats=sort(unique(as.character(res08$txtCat2))) # Discard some strange categories and order the category levels as we want them tidycats=c("WJ","WS","W40","W45","W50","W55","W60","MJ","MS","M40","M45","M50","M55","M60","M65","M70") res08=res08[as.character(res08$txtCat2)%in%tidycats,] newcat=factor(res08$txtCat2,levels=tidycats) png("CatAge2008.png",width=700,height=400) boxplot(FinishTime~newcat,data=res08,col=c("red","red","red","red","red","red","red","blue","blue","blue","blue","blue","blue","blue","blue","blue"), main="Fields of Athenry 10k Finish Times Dec 2008",ylab="Finish Time (minutes)",cex.axis=0.9,ylim=c(30,110)) abline(h=runner08$FinishTime,lty=3) dev.off() # Do the same thing, but age match men and women side-by-side (2008) tidycats=c("WJ","MJ","WS","MS","W40","M40","W45","M45","W50","M50","W55","M55","W60","M60","M65","M70") newcat=factor(res08$txtCat2,levels=tidycats) png("CatSex2008.png",width=700,height=400) boxplot(FinishTime~newcat,data=res08,col=c("red","blue","red","blue","red","blue","red","blue","red","blue","red","blue","red","blue","blue","blue"), main="Fields of Athenry 10k Finish Times Dec 2008",ylab="Finish Time (minutes)",cex.axis=0.9,ylim=c(30,110)) abline(h=runner08$FinishTime,lty=3) dev.off() # We can have a look at all the categories recorded for 2011 cats=sort(unique(as.character(res11$Cat))) # Discard some strange categories and order the category levels as we want them tidycats=c("FJ","FS","F40","F45","F50","F55","F60","F65","MJ","MS","M40","M45","M50","M55","M60","M65","M70") res11=res11[as.character(res11$Cat)%in%tidycats,] newcat=factor(res11$Cat,levels=tidycats) png("CatAge2011.png",width=700,height=400) boxplot(FinishTime~newcat,data=res11,col=c("red","red","red","red","red","red","red","red","blue","blue","blue","blue","blue","blue","blue","blue","blue","blue"), main="Fields of Athenry 10k Finish Times Jan 2011",ylab="Finish Time (minutes)",cex.axis=0.9,ylim=c(30,110)) abline(h=runner11$FinishTime,lty=3) dev.off() # Do the same thing, but age match men and women side-by-side (2011) tidycats=c("FJ","MJ","FS","MS","F40","M40","F45","M45","F50","M50","F55","M55","F60","M60","F65","M65","M70") newcat=factor(res11$Cat,levels=tidycats) png("CatSex2011.png",width=700,height=400) boxplot(FinishTime~newcat,data=res11,col=c("red","blue","red","blue","red","blue","red","blue","red","blue","red","blue","red","blue","red","blue","blue","blue"), main="Fields of Athenry 10k Finish Times Jan 2011",ylab="Finish Time (minutes)",cex.axis=0.9,ylim=c(30,110)) abline(h=runner11$FinishTime,lty=3) dev.off() # We can try to find out how many people ran in both races # Names are formatted differently (two columns in 2011, # one column with CAPS and a comma in 2008) splitname=t(data.frame(strsplit(as.character(res08$txtName),split=", "))) # Let's add some sensible new column names to res08 res08$Name=toupper(splitname[,2]) res08$Surname=toupper(splitname[,1]) # We should convert names from res11 to upper case also for consistency res11$Name=toupper(res11$Name) res11$Surname=toupper(res11$Surname) # Now we can generate a full name to search on res08$Fullname=paste(res08$Name,res08$Surname) res11$Fullname=paste(res11$Name,res11$Surname) # We can look for names that occur in both years overlap=unique(res08$Fullname[res08$Fullname%in%res11$Fullname]) # Unfortunately there's no way to guarantee that these names # refer to the same people in both years, but generally this will be true length(overlap) print(overlap)