#-----------------------------------------------------------------------
# Linear regression when the predictors are numeric
#-----------------------------------------------------------------------
# Time-stamp: <2011-02-19 21:42 Petri Koistinen>
# Use this file by entering the text a few lines at a time
# in the R console, e.g., with the R editor.
#--------------------------------
# Regression through the origin
#--------------------------------
# The model formula 'y ~ x' means that the model includes both
# a slope and an intercept (=constant term). Sometimes it
# makes sense to assume that the regression line goes through
# the origin. This can be specified by the model formula
# y ~ x - 1
# The term '-1' means that we remove the intercept from the default model.
# The model formula
# y ~ 1
# specifies a model which only includes the intercept.
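# A tiny illustration of these three formulas, using a made-up vector
# (not part of any data file used below):
x <- 1:4
model.matrix(~ x)                        # columns: intercept and x
model.matrix(~ x - 1)                    # only the x column, no intercept
model.matrix(~ 1, data = data.frame(x))  # intercept only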
e <- read.table('e1.dat')
# Regression through the origin might be sensible because of physical
# reasons.
e.m0 <- lm(distance ~ stretch - 1, data = e)
summary(e.m0)
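# For comparison, here is the same regression fitted with an intercept
# (the default model; the earlier simple-regression material presumably
# fitted this already):
e.m1 <- lm(distance ~ stretch, data = e)
summary(e.m1)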
# Compare the R^2 values of these two models: R^2 is much larger
# for the model with no intercept. This does not mean that the
# smaller model is better; the explanation is that R uses different
# definitions of R^2 depending on whether or not the linear model
# includes an intercept.
# The definition can be found here:
?summary.lm
# Do not compare R^2 values of two models, if one of them includes
# an intercept and the other does not!
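# The two definitions can be checked by hand (a sketch, using the fits above):
y <- e$distance
1 - deviance(e.m0) / sum(y^2)              # R^2 when the model has no intercept
1 - deviance(e.m1) / sum((y - mean(y))^2)  # R^2 when the model has an intercept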
#-------------------------
# Transformations
#-------------------------
# You can use transformations in model formulas.
# Let's read in some data to illustrate.
cement <- read.table('cement.dat', header = TRUE)
plot(cement$Time, cement$Strength,
xlab = 'Time', ylab = 'Tensile strength')
# ... obviously, a regression line on the original scale is useless.
plot(1/cement$Time, log(cement$Strength),
xlab = '1/Time', ylab = 'Log tensile strength')
# ... this looks better.
cem.m <- lm(log(Strength) ~ I(1/Time), data = cement)
abline(cem.m)
# The term I(1/Time) here protects the expression 1/Time so
# that it is not interpreted by the rules of model formulae but
# as an ordinary arithmetical expression. (I() stands for the
# identity function.)
# In model formulas the operators +, -, *, :, ^ and / have a special
# meaning. Use I() to protect those portions of the model formula
# where you need these operators in their algebraic meaning. See the
# following page for more details.
?formula
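# A small standalone illustration of the special meaning of '*' in formulas,
# using made-up vectors a and b (not from any data set):
a <- c(1, 2, 3, 4); b <- c(2, 1, 2, 1)
model.matrix(~ a * b)     # '*' expands to a, b and their interaction a:b
model.matrix(~ I(a * b))  # I() keeps a * b as ordinary multiplication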
# Function predict() applies the transformations specified in the formula.
# Let's add a confidence interval for the regression line
t <- 1:28
matlines(1/t, predict(cem.m, data.frame(Time = t), interval = 'confidence'))
# ... predict() does the transformation Time -> 1/Time and gives
# the prediction for log(Strength)
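# If you want fitted values on the original Strength scale, you can
# back-transform the predictions yourself, e.g. (a sketch)
exp(predict(cem.m, data.frame(Time = t)))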
#-----------------------------------------
# Multiple regression
#-----------------------------------------
# Fitting the model is as easy as in simple linear regression.
# We use data set LifeCycleSavings
data(LifeCycleSavings)
?LifeCycleSavings
# We fit the model
#
# sr = beta.0 + beta.pop15 * pop15 + beta.pop75 * pop75 + ...
# + beta.dpi * dpi + beta.ddpi * ddpi + epsilon,
#
# which has an intercept and slope parameters for each of the
# explanatory variables. Here epsilon stands for a random
# vector of errors, which has i.i.d. normally distributed
# components with mean zero and the same variance.
m <- lm(sr ~ pop15 + pop75 + dpi + ddpi, data = LifeCycleSavings)
m
summary(m)
# The t-test reported for each coefficient is for the hypothesis that
# the coefficient is zero, against the two-sided alternative.
# The F-statistic is for testing all the predictors, i.e., for the
# hypothesis that all the regression coefficients except the intercept
# are zero.
# You can apply to m the same extractor functions that we discussed
# in the context of simple linear regression.
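# For instance (a few common extractors; the earlier material may have
# used others):
coef(m)       # estimated regression coefficients
confint(m)    # confidence intervals for the coefficients
fitted(m)     # fitted values
residuals(m)  # residuals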
#-------------------------
# Polynomial regression
#-------------------------
# Polynomial regression can be considered a special case of
# multiple regression: now the explanatory variables are
# the original explanatory variable together with its powers.
# Try to explain the variable sr in terms of the variable ddpi.
plot(sr ~ ddpi, data = LifeCycleSavings)
s.m1 <- lm(sr ~ ddpi, data = LifeCycleSavings)
abline(s.m1)
# The data seems to curve away from the regression line;
# let's try a second degree polynomial.
s.m2 <- lm(sr ~ ddpi + I(ddpi^2), data = LifeCycleSavings)
# ... It is necessary to protect the term ddpi^2 with I() here.
# Add the plot of the fitted curve.
u <- 0:17
lines(u, predict(s.m2, newdata = data.frame(ddpi = u)))
# ... Once again, predict() does the needed transformation.
# Of course, you can also use higher degree polynomials.
# It is a good idea to use orthogonal polynomials to improve
# the numerical stability. You can create such polynomials with poly().
# Let's try a third degree fit (using orthogonal polynomials).
(s.m3 <- lm(sr ~ poly(ddpi, 3), data = LifeCycleSavings))
# An alternative would be to fit the model
# sr ~ ddpi + I(ddpi^2) + I(ddpi^3)
# "Orthogonality" means here that ...
(X <- model.matrix(s.m3))
t(X) %*% X # ... this matrix is diagonal
# Let's round the numbers in order to see the pattern more clearly
round(t(X) %*% X, 5)
# Let's compare using the model matrix of s.m2
X <- model.matrix(s.m2)
round(t(X) %*% X, 5) # Not diagonal
# Plotting the regression curve is now a bit more complicated,
# see ?predict.poly
p <- predict(poly(LifeCycleSavings$ddpi, 3), newdata = u)
lines(u, cbind(1, p) %*% coef(s.m3))
# The third degree fit is now practically the same as the
# second degree polynomial, and you can guess that from
summary(s.m3)
# see the t-value and p-value of the third degree term.
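# That row can also be picked out directly from the coefficient table:
coef(summary(s.m3))[4, ]  # estimate, std. error, t value and p-value of the cubic term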
#--------------
# F test
#--------------
# One often wants to compare the original ("full") linear model with a
# linear model that is a special case of it (the reduced or restricted
# model, also called a submodel). This can be done by calculating the
# value of the F-statistic, which has the F-distribution (with certain
# degrees of freedom) under the null hypothesis that the reduced model
# holds. This requires fitting both models and then calculating the
# residual sum of squares and the residual degrees of freedom for each
# of them.
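# As a sketch, here is that computation done by hand for the two polynomial
# models fitted above (s.m2 as the reduced model, s.m3 as the full one);
# anova(), introduced below, does exactly this for us:
rss.red <- deviance(s.m2);  df.red <- df.residual(s.m2)
rss.full <- deviance(s.m3); df.full <- df.residual(s.m3)
Fstat <- ((rss.red - rss.full) / (df.red - df.full)) / (rss.full / df.full)
Fstat
pf(Fstat, df.red - df.full, df.full, lower.tail = FALSE)  # p-value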
# In R the F test is performed by anova(), when it is called as
# anova(reduced.model, full.model)
# Here reduced.model and full.model are model objects returned
# by lm(). It is the user's responsibility to make sure that
# the reduced model indeed can be obtained as a linear submodel
# of the full linear model.
# Let's test the null hypothesis that we do not need the third
# degree term in the model
# sr ~ ddpi + I(ddpi^2) + I(ddpi^3)
anova(s.m2, s.m3)
# ... the p-value is the same as in the t-test for the third
# degree term.
summary(s.m3)
# Wait a second! Why is s.m2 a submodel of s.m3? The two models used
# completely different polynomial bases!
# Well, the basis poly(ddpi, 3) spans all third degree polynomials,
# and any second degree polynomial can be obtained from that
# basis by a suitable linear restriction on the coefficients.
# Therefore the comparison was correct.
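# You can also check this numerically: the second degree fit is the same
# in both bases (a quick sanity check):
all.equal(fitted(s.m2), fitted(lm(sr ~ poly(ddpi, 2), data = LifeCycleSavings)))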
# Let's do some further F tests using the LifeCycleSavings data
m <- lm(sr ~ ., data = LifeCycleSavings)
# This is the full model. The formula 'sr ~ .' specifies the model which
# contains all the other variables of the data frame as predictors,
# with sr as the response variable.
# 1) Test of all predictors:
# H0: all the slopes are zero
m.red <- lm(sr ~ 1, data = LifeCycleSavings)
anova(m.red, m)
# This is the same test that you find on the "F-statistic" row of the summary:
summary(m)
# 2) Test the null hypothesis that the coefficient of dpi is zero
m.red <- lm(sr ~ . - dpi, data = LifeCycleSavings)
# ... this model formula stands for the model which includes all the
# variables of the data frame as predictors except sr (the response)
# and dpi (which is removed with '-dpi').
anova(m.red, m)
# This test is equivalent to the t-test on the dpi row:
summary(m)
# The F-statistic is the square of the t-statistic, and the
# p-values are the same. This is a consequence of the theory
# of linear models.
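# A quick numerical check of that relationship:
coef(summary(m))["dpi", "t value"]^2  # equals the F statistic from anova() above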
# 3) Test the null hypothesis that the coefficients of pop15 and pop75
# are the same
m.red <- lm(sr ~ I(pop15 + pop75) + dpi + ddpi, data = LifeCycleSavings)
# Notice that we need to protect the expression pop15 + pop75 with I().
anova(m.red, m)
# 4) Test the null hypothesis that the coefficient of ddpi is one
# The reduced model is
#
# sr = beta.0 + beta.pop15 * pop15 + beta.pop75 * pop75 +
# beta.dpi * dpi + ddpi + epsilon
#
# where there is no regression coefficient in front of ddpi.
# This model can be fitted, when we declare that ddpi is an offset.
m.red <- lm(sr ~ pop15 + pop75 + dpi + offset(ddpi), data = LifeCycleSavings)
# An offset is a term (such as ddpi above) that we want to include in the
# model without estimating a coefficient for it; its coefficient is fixed at one.
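# Equivalently (a sketch), the same test can be obtained from the full model
# as a t-test of H0: beta.ddpi = 1; the square of that t statistic equals
# the F statistic computed below.
est <- coef(summary(m))["ddpi", ]
((est["Estimate"] - 1) / est["Std. Error"])^2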
anova(m.red, m)
# 5) Can we test a null hypothesis such as
# H0: beta.ddpi * beta.pop15 = -0.2 ?
# Well, not with any F-test since the theory of F-tests does not
# cover such non-linear restrictions on the regression coefficients.