```
# Load packages, installing any that are missing.
packages <- c("downloader", "ggplot2")
invisible(lapply(packages, FUN = function(x) {
  if (!require(x, character.only = TRUE)) {
    install.packages(x)
    library(x, character.only = TRUE)
  }
}))
```

This session focuses on (linear) models. The first section covers computational and graphical methods to detect linear trends in your data, using correlation coefficients and scatterplot matrices. The second section is a rough guide to linear regression, and requires that you read extensively from your handbooks for the mathematical background.
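As a brief preview of that first section, here is a minimal sketch on simulated data (the variables `x`, `y` and `z` are illustrative and not part of this session's datasets): `cor()` computes correlation coefficients, and `pairs()` draws a scatterplot matrix of every pair of variables.

```
# Simulate one linear trend and one unrelated variable.
set.seed(20)
d <- data.frame(x = 1:50)
d$y <- 2 * d$x + rnorm(50, sd = 10)  # linear trend plus noise
d$z <- rnorm(50)                     # no relationship to x
# Correlation matrix: close to 1 for x and y, close to 0 for x and z.
cor(d)
# Scatterplot matrix of all variable pairs.
pairs(d)
```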

Models are commonly used for all sorts of purposes that revolve around several definitions of prediction. Ordinary language defines prediction as forecasting: the ability to estimate something that is not yet in the data. A simple example would be Markus Gesmann's prediction of London Olympics 100m men's sprint results. Let's load his data:

```
# Identify the data file.
file <- "data/olympics.2012.csv"
# Identify the data source.
link <- "https://raw.github.com/briatte/ida/master/data/olympics.2012.csv"
# Download the data if needed.
if (!file.exists(file)) download(link, file, mode = "wb")
# Read the data (comma-separated values).
olympics <- read.csv(file, stringsAsFactors = FALSE)
# Check result.
head(olympics)
```

```
  Year    Event       Athlete Medal Country Result
1 1896 100m Men     Tom Burke  GOLD     USA   12.0
2 1900 100m Men  Frank Jarvis  GOLD     USA   11.0
3 1904 100m Men   Archie Hahn  GOLD     USA   11.0
4 1906 100m Men   Archie Hahn  GOLD     USA   11.2
5 1908 100m Men Reggie Walker  GOLD     SAF   10.8
6 1912 100m Men   Ralph Craig  GOLD     USA   10.8
```

In his code, Markus Gesmann uses a log-linear model to predict the 2012 sprint result from the series of past results, using all data from 1900 onwards. To understand this model, we take a quick look at the sprint results and add a LOESS curve to the plot, which provides a smoothed trend of the series:

```
# Scatterplot of sprinter results.
g <- qplot(data = olympics, y = Result, x = Year)
# Show plot with LOESS curve.
g + geom_smooth()
```

If we exclude the first data point from 1896, there appears to be a downward linear trend in the data. We can compute and visualize what a simple linear model of the form \(Y \text{(result)} = m X \text{(year)} + b\) would look like at that stage: it tells us that, for every year that goes by, the sprint time decreases by roughly 0.01 seconds.

```
# Compute the sprint result as a function of its year, excluding 1896.
olympics.linear <- lm(Result ~ Year, data = olympics[-1, ])
# Show the result.
olympics.linear
```

```
Call:
lm(formula = Result ~ Year, data = olympics[-1, ])

Coefficients:
(Intercept)         Year
    33.0844      -0.0116
```

```
# Plot the result.
g + geom_abline(intercept = coef(olympics.linear)[1], slope = coef(olympics.linear)[2])
```

Because the marginal rate of improvement in the sprint results is very small (\(4|m| \approx 0.05\) seconds every four-year competition cycle), it makes sense to model the result on a logarithmic scale instead. We then take the natural exponential of the model's prediction for year 2012 to get the predicted sprint result in that year.

```
# Model the natural logarithm of the sprint result.
olympics.log.linear <- lm(log(Result) ~ Year, data = olympics[-1, ])
# Create the sequence of years for which to predict the result.
predicted.years <- data.frame(Year = seq(1900, 2012, 4))
# Predict the result for years 1900-2012, with 95% prediction intervals.
predicted.times <- data.frame(predicted.years,
                              exp(predict(olympics.log.linear,
                                          newdata = predicted.years,
                                          level = 0.95,
                                          interval = "prediction")))
# Merge predictions to the original data.
olympics <- merge(olympics, predicted.times, by = "Year", all = TRUE)
# Check result, excluding a few columns.
tail(olympics)[-c(2, 4)]
```

```
   Year          Athlete Country Result   fit   lwr    upr
26 1992 Linford Christie     GBR   9.96 9.895 9.607 10.191
27 1996   Donovan Bailey     CAN   9.84 9.851 9.563 10.147
28 2000   Maurice Greene     USA   9.87 9.807 9.519 10.103
29 2004    Justin Gatlin     USA   9.85 9.763 9.475 10.060
30 2008       Usain Bolt     JAM   9.69 9.719 9.430 10.017
31 2012             <NA>    <NA>     NA 9.676 9.386  9.974
```

The log-linear prediction has added three columns to the data (the fitted value with its lower and upper prediction bounds), as well as a row for year 2012, for which no result is yet observed. The graph below singles out that prediction, at 9.6756 seconds, and shows the rest of the predicted trend along with the actual (observed) data.

```
# Plot the model.
qplot(data = olympics, x = Year, y = Result) +
  geom_point(x = 2012, y = olympics$fit[31], color = "yellow", size = 10) +
  geom_point(y = olympics$fit, color = "red") +
  geom_line(y = olympics$upr, color = "red", linetype = "dashed") +
  geom_line(y = olympics$lwr, color = "red", linetype = "dashed")
```

As Markus Gesmann notes, his model came very close to reality: in 2012, the winner of the men's 100m final was only slightly quicker than predicted, at 9.63 seconds. This illustrates the common use of models to forecast results from data such as financial markets, elections or sports events. Note that the model actually improves if you take year 1896 into account:

```
# Model the full data, including year 1896.
olympics.log.linear.full <- lm(log(Result) ~ Year, data = olympics)
# Predict the result for year 2012 (row 29 of the predicted years).
data.frame(predicted.years,
           exp(predict(olympics.log.linear.full,
                       newdata = predicted.years,
                       level = 0.95,
                       interval = "prediction")))[29, ]
```

```
Year fit lwr upr
29 2012 9.623 9.191 10.07
```

The next pages show a different use of linear models, where we look for linear trends in the existing data in order to interpret how different variables might be associated with each other. The forecasting of future values is mathematically possible but generally not of interest in such models, which are explanatory rather than predictive.
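As a minimal sketch of that explanatory use, on simulated data (the variables `x` and `y` and the effect size are illustrative, not part of this session's datasets), the focus shifts from `predict()` to the estimated coefficients and their confidence intervals:

```
# Simulate data with a known association between x and y.
set.seed(7)
d <- data.frame(x = rnorm(100))
d$y <- 1 + 0.5 * d$x + rnorm(100)
# Fit the linear model.
m <- lm(y ~ x, data = d)
# Interpret the estimated slope rather than forecast new values.
coef(m)
# 95% confidence intervals around the estimates.
confint(m)
```

Here the slope estimate for `x` should land near its true value of 0.5, and the interpretation ("a one-unit change in x is associated with a half-unit change in y") matters more than any forecast.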

Next: Linear correlation.