#-----------------------------------------------------------------------
# Linear regression when the predictors are numeric
#-----------------------------------------------------------------------
# Time-stamp: <2011-02-19 21:42 Petri Koistinen>

# Use this file by entering the text a few lines at a time
# in the R console, e.g., with the R editor.

#--------------------------------
# Regression through the origin
#--------------------------------

# The model formula 'y ~ x' means that the model includes both
# a slope and an intercept (= constant term). Sometimes it
# makes sense to assume that the regression line goes through
# the origin. This can be specified by the model formula
#   y ~ x - 1
# The term '-1' means that we remove the intercept from the default model.
# The model formula
#   y ~ 1
# specifies a model which only includes the intercept.

e <- read.table('e1.dat')
# Regression through the origin might be sensible because of physical
# reasons.
e.m0 <- lm(distance ~ stretch - 1, data = e)
summary(e.m0)

# Compare the R^2 value of this model with that of the corresponding
# model with an intercept (distance ~ stretch): the R^2 is much
# larger for the model with no intercept. This does not mean
# that the smaller model is better; the explanation is that
# one uses different definitions for the quantity R^2 when
# the linear model includes an intercept and when it does not.
# The definition can be found here:
?summary.lm
# Do not compare the R^2 values of two models if one of them includes
# an intercept and the other does not!

#-------------------------
# Transformations
#-------------------------

# You can use transformations in the model formulas.
# Let's read in some data to illustrate.
cement <- read.table('cement.dat', header = TRUE)
plot(cement$Time, cement$Strength, xlab = 'Time', ylab = 'Tensile strength')
# ... obviously, a regression line on the original scale is useless.
plot(1/cement$Time, log(cement$Strength), xlab = '1/Time', ylab = 'Log tensile strength')
# ... this looks better.
cem.m <- lm(log(Strength) ~ I(1/Time), data = cement)
abline(cem.m)

# The term I(1/Time) here protects the expression 1/Time so
# that it is not interpreted by the rules of model formulae but
# as an ordinary arithmetical expression. (I() stands for the
# identity function.)
# In model formulas the operators +, -, *, :, ^ and / have a special
# meaning. Use I() to protect those portions of the model formula
# where you need these operators in their algebraic meaning. See the
# following page for more details.
?formula

# Function predict() applies the transformations specified in the formula.
# Let's add a confidence interval for the regression line.
t <- 1:28
matlines(1/t, predict(cem.m, data.frame(Time = t), interval = 'confidence'))
# ... predict() does the transformation Time -> 1/Time and gives
# the prediction for log(Strength).

#-----------------------------------------
# Multiple regression
#-----------------------------------------

# Fitting the model is as easy as in simple linear regression.
# We use the data set LifeCycleSavings.
data(LifeCycleSavings)
?LifeCycleSavings

# We fit the model
#
#   sr = beta.0 + beta.pop15 * pop15 + beta.pop75 * pop75 +
#        beta.dpi * dpi + beta.ddpi * ddpi + epsilon,
#
# which has an intercept and a slope parameter for each of the
# explanatory variables. Here epsilon stands for a random
# vector of errors, which has i.i.d. normally distributed
# components with mean zero and the same variance.
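# To see concretely what this model structure means, one can peek at the
# first rows of its design matrix: a column of ones for the intercept
# and one column per explanatory variable. (This peek is only an
# illustration; it is not needed for fitting the model.)
head(model.matrix(~ pop15 + pop75 + dpi + ddpi, data = LifeCycleSavings))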
m <- lm(sr ~ pop15 + pop75 + dpi + ddpi, data = LifeCycleSavings)
m
summary(m)
# The t-test of a coefficient is for the hypothesis that this
# coefficient is zero, against the two-sided alternative.
# The F-statistic is for testing all the predictors, i.e., for the
# hypothesis that all the regression coefficients except the intercept
# are zero.

# You can apply to m the same extractor functions which
# we discussed in the context of simple linear regression.

#-------------------------
# Polynomial regression
#-------------------------

# Polynomial regression can be considered a special case of
# multiple regression: now the explanatory variables are
# the original explanatory variable together with its powers.

# Try to explain the variable sr in terms of the variable ddpi.
plot(sr ~ ddpi, data = LifeCycleSavings)
s.m1 <- lm(sr ~ ddpi, data = LifeCycleSavings)
abline(s.m1)
# The data seem to curve away from the regression line;
# let's try a second degree polynomial.
s.m2 <- lm(sr ~ ddpi + I(ddpi^2), data = LifeCycleSavings)
# ... It is necessary to isolate the term ddpi^2 with I() here.

# Add the plot of the fitted curve.
u <- 0:17
lines(u, predict(s.m2, newdata = data.frame(ddpi = u)))
# ... Once again, predict() does the needed transformation.

# Of course, you can also use higher degree polynomials.
# It is a good idea to use orthogonal polynomials to improve
# the numerical stability. You can create such polynomials with poly().
# Let's try a third degree fit (using orthogonal polynomials).
(s.m3 <- lm(sr ~ poly(ddpi, 3), data = LifeCycleSavings))
# An alternative would be to fit the model
#   sr ~ ddpi + I(ddpi^2) + I(ddpi^3)

# "Orthogonality" means here that ...
(X <- model.matrix(s.m3))
t(X) %*% X
# ... this matrix is diagonal.
# Let's round the numbers in order to see the pattern more clearly.
round(t(X) %*% X, 5)
# For comparison, look at the model matrix of s.m2.
X <- model.matrix(s.m2)
round(t(X) %*% X, 5)
# Not diagonal.

# Plotting the regression curve is now a bit more complicated,
# see ?predict.poly
p <- predict(poly(LifeCycleSavings$ddpi, 3), newdata = u)
lines(u, cbind(1, p) %*% coef(s.m3))

# The third degree fit is now practically the same as the
# second degree polynomial, and you can guess that from
summary(s.m3)
# ... see the t-value and p-value of the third degree term.

#--------------
# F test
#--------------

# One often wants to compare the original ("full") linear model with a
# linear model which is a special case of the original model (the
# reduced or restricted model, or the submodel). This can be done by
# calculating the value of the F-statistic, which has the
# F-distribution (with certain degrees of freedom) when the reduced
# model is true, i.e., when the null hypothesis that the reduced model
# is true holds. This requires fitting the two models and then
# calculating the residual sum of squares and the residual degrees of
# freedom for both models.

# In R the F test is performed by anova(), when it is called as
#   anova(reduced.model, full.model)
# Here reduced.model and full.model are model objects returned
# by lm(). It is the user's responsibility to make sure that
# the reduced model indeed can be obtained as a linear submodel
# of the full linear model.

# Let's test the null hypothesis that we do not need the third
# degree term in the model
#   sr ~ ddpi + I(ddpi^2) + I(ddpi^3)
anova(s.m2, s.m3)
# ... the p-value is the same as in the t-test for the third
# degree term.
summary(s.m3)
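# As a check, the F statistic above can also be computed by hand from
# the residual sums of squares and the residual degrees of freedom of
# the two models (this is only an illustration of the definition):
rss2 <- deviance(s.m2); df2 <- df.residual(s.m2)
rss3 <- deviance(s.m3); df3 <- df.residual(s.m3)
(F.stat <- ((rss2 - rss3) / (df2 - df3)) / (rss3 / df3))
pf(F.stat, df2 - df3, df3, lower.tail = FALSE)
# ... the same F value and p-value as reported by anova(s.m2, s.m3).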
# Wait a second! Why is s.m2 a submodel of s.m3? The two models used
# completely different polynomial bases!
# Well, the basis poly(ddpi, 3) spans all third degree polynomials,
# and any second degree polynomial can be obtained from that
# basis by a suitable linear restriction on the coefficients.
# Therefore the comparison was correct.

# Let's do some further F tests using the LifeCycleSavings data.
m <- lm(sr ~ ., data = LifeCycleSavings)
# This is the full model. The model formula means the model which
# contains the other variables of the data frame as predictors
# except sr, which is the response variable.

# 1) Test of all predictors:
#    H0: all the slopes are zero
m.red <- lm(sr ~ 1, data = LifeCycleSavings)
anova(m.red, m)
# This is the same test which you find on the row "F-statistic" of
summary(m)

# 2) Test the null hypothesis that the coefficient of dpi is zero.
m.red <- lm(sr ~ . - dpi, data = LifeCycleSavings)
# ... this model formula stands for the model which includes all the
# variables of the data frame as predictors except sr (the response)
# and dpi (which is removed with '- dpi').
anova(m.red, m)
# This test is equivalent to the t-test on the row dpi:
summary(m)
# The F-statistic is the square of the t-statistic, and the
# p-values are the same. This is a consequence of the theory
# of linear models. (A numerical check of this is given at the
# end of this file.)

# 3) Test the null hypothesis that the coefficients of pop15 and pop75
#    are the same.
m.red <- lm(sr ~ I(pop15 + pop75) + dpi + ddpi, data = LifeCycleSavings)
# Notice that we need to protect the expression pop15 + pop75.
anova(m.red, m)

# 4) Test the null hypothesis that the coefficient of ddpi is one.
# The reduced model is
#
#   sr = beta.0 + beta.pop15 * pop15 + beta.pop75 * pop75 +
#        beta.dpi * dpi + ddpi + epsilon,
#
# where there is no regression coefficient in front of ddpi.
# This model can be fitted when we declare that ddpi is an offset.
m.red <- lm(sr ~ pop15 + pop75 + dpi + offset(ddpi), data = LifeCycleSavings)
# An offset is a term (such as ddpi above) that we want to include in the
# model without fitting a coefficient for it.
anova(m.red, m)

# 5) Can we test a null hypothesis such as
#      H0: beta.ddpi * beta.pop15 = -0.2 ?
# Well, not with any F-test, since the theory of F-tests does not
# cover such non-linear restrictions on the regression coefficients.
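# Finally, a numerical check of the claim made in test 2 above that the
# F statistic is the square of the t statistic (only an illustration):
m.red <- lm(sr ~ . - dpi, data = LifeCycleSavings)
anova(m.red, m)$F[2]
summary(m)$coefficients['dpi', 't value']^2
# ... the two numbers agree.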