# Place this .R file together with the two data files in a directory.
# Change the current directory in R to that directory (e.g. focus on R Console File->ChangeDir)
# Open the two relevant results files as R data frames

#http://www.athenryac.com/sites/www.athenryac.com/files/FullResult.txt
res08=read.delim("FullResult.txt",sep="\t",header=TRUE)
#http://www.athenryac.com/sites/www.athenryac.com/files/10foa10km_provisional.txt
res11=read.delim("10foa10km_provisional.txt",sep="\t",skip=1,header=TRUE)

# Already we can see a difference, the second file has a title line
# above the header describing the columns, which I skipped with the "skip=1" argument

# We can inspect the contents of these files by executing "edit(res11)" for example,
# or executing "edit(head(res11))" to just see the first few lines.

# The first dataset doesn't have a sex column, but we can extract that information 
# from the txtCat2 column and add it to the data frame ourselves
res08$Sex=substr(res08$txtCat2,1,1)

# Now we can convert the time strings in both data frames into R time objects
res08$FinishTime=as.numeric(as.difftime(as.character(res08$txtTime),format="%H:%M:%S"))
res11$FinishTime=as.numeric(as.difftime(as.character(res11$Chip.Time),format="%H:%M:%S"))

# We can find out how many people ran, and had their time recorded in each year
length(res08$FinishTime[!is.na(res08$FinishTime)])
length(res11$FinishTime[!is.na(res11$FinishTime)])

# Now we look at MY time.  These are my bib numbers for the two years
bib08=737
bib11=650

# We can filter out my data from the two data frames
runner08=res08[res08$txtEntrantNo==bib08,]
runner11=res11[res11$Bib==bib11,]

# We can generate summary tables for the finish times and compare with my times
summary(res08$FinishTime)
runner08$FinishTime
summary(res11$FinishTime)
runner11$FinishTime

png("FinishDist.png",width=700,height=400)

# We can look at the distribution of finishing times
plot(density(res08$FinishTime,na.rm=TRUE),lwd=3,xlab="Finish Time (mins)", main="Fields of Athenry 10k Results")
points(density(res11$FinishTime,na.rm=TRUE),type="l",lwd=3,col="red")
legend("topright",c("Dec 2008","Jan 2011"),col=c("black","red"),lwd=2.5)

# And indicate my finish times on the plot
abline(v=runner08$FinishTime, lwd="2.5", lty="dashed")
abline(v=runner11$FinishTime,lwd="2.5", lty="dotted",col="red")

dev.off()

# We can ask ourselves if the finishing times in 2011 were generally faster than those in 2008
# by performing a non-parametric Mann-Witney test for differences in median finishing time.
# This test doesn't assume any particular shape for distributions (they are obviously non-normal).
# Another alternative is to carry out a t-test for differences in the mean finishing times.  I don't like this
# as it assumes normal distributions.  Interestingly the two computed p-values are very different (though
# neither suggest a significant difference between years)
wilcox.test(res08$FinishTime,res11$FinishTime,conf.int=TRUE)
t.test(res08$FinishTime,res11$FinishTime,conf.int=TRUE)

# There are many other comparisons we can make, for instance we can compare mens times with womens times
png("SexCompare.png",width=700,height=400)

plot(density(res08$FinishTime[res08$Sex=="M"],na.rm=TRUE),lwd=3,xlab="Finish Time (mins)", main="Fields of Athenry 10k Results")
points(density(res08$FinishTime[res08$Sex!="M"],na.rm=TRUE),lwd=3,type="l",lty=2)
points(density(res11$FinishTime[res11$Sex=="M"],na.rm=TRUE),lwd=3,type="l",col="red",)
points(density(res11$FinishTime[res11$Sex!="M"],na.rm=TRUE),lwd=3,type="l",col="red",lty=2)
legend("topright",c("Men Dec 2008","Women Dec 2008","Men Jan 2011","Women Jan 2011"),col=c("black","black","red","red"),lwd=2.5,lty=c(1,2,1,2))

dev.off()

# And look at summaries for these
summary(res08$FinishTime[res08$Sex=="M"])
summary(res08$FinishTime[res08$Sex!="M"])
summary(res11$FinishTime[res11$Sex=="M"])
summary(res11$FinishTime[res11$Sex!="M"])

# And test for differences between men and women
wilcox.test(res08$FinishTime[res08$Sex=="M"],res08$FinishTime[res08$Sex!="M"],conf.int=TRUE)
wilcox.test(res11$FinishTime[res11$Sex=="M"],res11$FinishTime[res08$Sex!="M"],conf.int=TRUE)

# Or compare men 2008 with men 2011 and similar for women
wilcox.test(res08$FinishTime[res08$Sex=="M"],res11$FinishTime[res11$Sex=="M"],conf.int=TRUE)
wilcox.test(res08$FinishTime[res08$Sex!="M"],res11$FinishTime[res08$Sex!="M"],conf.int=TRUE)

# We can have a look at all the categories recorded for 2008
cats=sort(unique(as.character(res08$txtCat2)))
# Discard some strange categories and order the category levels as we want them
tidycats=c("WJ","WS","W40","W45","W50","W55","W60","MJ","MS","M40","M45","M50","M55","M60","M65","M70")
res08=res08[as.character(res08$txtCat2)%in%tidycats,]
newcat=factor(res08$txtCat2,levels=tidycats)
png("CatAge2008.png",width=700,height=400)
boxplot(FinishTime~newcat,data=res08,col=c("red","red","red","red","red","red","red","blue","blue","blue","blue","blue","blue","blue","blue","blue"),
main="Fields of Athenry 10k Finish Times Dec 2008",ylab="Finish Time (minutes)",cex.axis=0.9,ylim=c(30,110))
abline(h=runner08$FinishTime,lty=3)
dev.off()

# Do the same thing, but age match men and women side-by-side (2008)
tidycats=c("WJ","MJ","WS","MS","W40","M40","W45","M45","W50","M50","W55","M55","W60","M60","M65","M70")
newcat=factor(res08$txtCat2,levels=tidycats)
png("CatSex2008.png",width=700,height=400)
boxplot(FinishTime~newcat,data=res08,col=c("red","blue","red","blue","red","blue","red","blue","red","blue","red","blue","red","blue","blue","blue"),
main="Fields of Athenry 10k Finish Times Dec 2008",ylab="Finish Time (minutes)",cex.axis=0.9,ylim=c(30,110))
abline(h=runner08$FinishTime,lty=3)
dev.off()

# We can have a look at all the categories recorded for 2011
cats=sort(unique(as.character(res11$Cat)))
# Discard some strange categories and order the category levels as we want them
tidycats=c("FJ","FS","F40","F45","F50","F55","F60","F65","MJ","MS","M40","M45","M50","M55","M60","M65","M70")
res11=res11[as.character(res11$Cat)%in%tidycats,]
newcat=factor(res11$Cat,levels=tidycats)
png("CatAge2011.png",width=700,height=400)
boxplot(FinishTime~newcat,data=res11,col=c("red","red","red","red","red","red","red","red","blue","blue","blue","blue","blue","blue","blue","blue","blue","blue"),
main="Fields of Athenry 10k Finish Times Jan 2011",ylab="Finish Time (minutes)",cex.axis=0.9,ylim=c(30,110))
abline(h=runner11$FinishTime,lty=3)
dev.off()

# Do the same thing, but age match men and women side-by-side (2011)
tidycats=c("FJ","MJ","FS","MS","F40","M40","F45","M45","F50","M50","F55","M55","F60","M60","F65","M65","M70")
newcat=factor(res11$Cat,levels=tidycats)
png("CatSex2011.png",width=700,height=400)
boxplot(FinishTime~newcat,data=res11,col=c("red","blue","red","blue","red","blue","red","blue","red","blue","red","blue","red","blue","red","blue","blue","blue"),
main="Fields of Athenry 10k Finish Times Jan 2011",ylab="Finish Time (minutes)",cex.axis=0.9,ylim=c(30,110))
abline(h=runner11$FinishTime,lty=3)
dev.off()

# We can try to find out how many people ran in both races
# Names are formatted differently (two columns in 2011,
# one column with CAPS and a comma in 2008)
splitname=t(data.frame(strsplit(as.character(res08$txtName),split=", ")))
# Let's add some sensible new column names to res08
res08$Name=toupper(splitname[,2])
res08$Surname=toupper(splitname[,1])
# We should convert names from res11 to upper case also for consistency
res11$Name=toupper(res11$Name)
res11$Surname=toupper(res11$Surname)
# Now we can generate a full name to search on
res08$Fullname=paste(res08$Name,res08$Surname)
res11$Fullname=paste(res11$Name,res11$Surname)
# We can look for names that occur in both years
overlap=unique(res08$Fullname[res08$Fullname%in%res11$Fullname])
# Unfortunately there's no way to guarantee that these names
# refer to the same people in both years, but generally this will be true
length(overlap)
print(overlap)