Modeling in R

(Simple) Linear Regression

Let’s predict length from width.

Begin with a plot

head(KidsFeet)
##     name birthmonth birthyear length width sex biggerfoot domhand
## 1  David          5        88   24.4   8.4   B          L       R
## 2   Lars         10        87   25.4   8.8   B          L       L
## 3   Zach         12        87   24.5   9.7   B          R       R
## 4   Josh          1        88   25.2   9.8   B          L       R
## 5   Lang          2        88   25.1   8.9   B          L       R
## 6 Scotty          3        88   25.7   9.7   B          R       R
gf_point(length ~ width, data = KidsFeet) %>%
  gf_lm(length ~ width)

Fit a model and summarize

Here is the summary several times over using different styes. msummary() gives a more minimal summary.

Kids.model <- lm(length ~ width, data = KidsFeet)
msummary(Kids.model)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.8172     2.9381   3.341  0.00192 ** 
## width         1.6576     0.3262   5.081  1.1e-05 ***
## 
## Residual standard error: 1.025 on 37 degrees of freedom
## Multiple R-squared:  0.411,  Adjusted R-squared:  0.3951 
## F-statistic: 25.82 on 1 and 37 DF,  p-value: 1.097e-05
Kids.model %>% msummary() 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.8172     2.9381   3.341  0.00192 ** 
## width         1.6576     0.3262   5.081  1.1e-05 ***
## 
## Residual standard error: 1.025 on 37 degrees of freedom
## Multiple R-squared:  0.411,  Adjusted R-squared:  0.3951 
## F-statistic: 25.82 on 1 and 37 DF,  p-value: 1.097e-05
lm(length ~ width, data = KidsFeet) %>%
  msummary()
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.8172     2.9381   3.341  0.00192 ** 
## width         1.6576     0.3262   5.081  1.1e-05 ***
## 
## Residual standard error: 1.025 on 37 degrees of freedom
## Multiple R-squared:  0.411,  Adjusted R-squared:  0.3951 
## F-statistic: 25.82 on 1 and 37 DF,  p-value: 1.097e-05

ANOVA table if you like

anova(Kids.model)
## Analysis of Variance Table
## 
## Response: length
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## width      1 27.114 27.1136  25.819 1.097e-05 ***
## Residuals 37 38.856  1.0502                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Function-based predictions

# create a function
predicted.length <- makeFun(Kids.model)
# use it to make a prediction
predicted.length(width = 8.5)
##        1 
## 23.90702
# use it to make multiple predictions
predicted.length(width = c(8, 8.5, 9))
##        1        2        3 
## 23.07820 23.90702 24.73583
# or predict for all widths in the data set
fitted(Kids.model)
##        1        2        3        4        5        6        7        8 
## 23.74125 24.40430 25.89616 26.06193 24.57007 25.89616 25.73040 24.40430 
##        9       10       11       12       13       14       15       16 
## 25.23312 24.40430 26.06193 24.57007 24.90159 26.06193 25.23312 22.91244 
##       17       18       19       20       21       22       23       24 
## 24.23854 24.40430 24.73583 25.56464 25.06735 24.07278 23.57549 24.73583 
##       25       26       27       28       29       30       31       32 
## 23.24397 25.39888 25.56464 25.56464 24.57007 25.23312 25.23312 24.07278 
##       33       34       35       36       37       38       39 
## 24.07278 24.73583 24.07278 23.90702 24.73583 22.91244 24.40430

But wait, there’s more – interval predictions

predicted.length( width = 9, interval = "confidence")
##        fit     lwr      upr
## 1 24.73583 24.4033 25.06835
predicted.length( width = 9, interval = "prediction")
##        fit      lwr      upr
## 1 24.73583 22.63299 26.83866
gf_point(length ~ width, data = KidsFeet) %>%
  gf_lm(length ~ width) %>%
  gf_pointrange(24.73 + 24.4 + 25.06 ~ 9, color = "red")

gf_pointrange()
## gf_pointrange uses a formula with shape y + ymin + ymax ~ x.
## Additional attributes include: 
##     alpha, color, group, linetype, size, fatten = 2
## For more information, try ?gf_pointrange
apropos("gf_")
##  [1] "gf_abline"     "gf_area"       "gf_bar"        "gf_boxplot"   
##  [5] "gf_coefline"   "gf_col"        "gf_contour"    "gf_count"     
##  [9] "gf_counts"     "gf_crossbar"   "gf_curve"      "gf_dens"      
## [13] "gf_density"    "gf_density_2d" "gf_density2d"  "gf_dotplot"   
## [17] "gf_errorbar"   "gf_errorbarh"  "gf_facet_grid" "gf_facet_wrap"
## [21] "gf_frame"      "gf_freqpoly"   "gf_fun"        "gf_function"  
## [25] "gf_hex"        "gf_histogram"  "gf_hline"      "gf_jitter"    
## [29] "gf_label"      "gf_labs"       "gf_lims"       "gf_line"      
## [33] "gf_linerange"  "gf_lm"         "gf_path"       "gf_point"     
## [37] "gf_pointrange" "gf_qq"         "gf_quantile"   "gf_raster"    
## [41] "gf_rect"       "gf_refine"     "gf_ribbon"     "gf_rug"       
## [45] "gf_segment"    "gf_smooth"     "gf_spline"     "gf_spoke"     
## [49] "gf_step"       "gf_text"       "gf_theme"      "gf_tile"      
## [53] "gf_violin"     "gf_vline"

Automated intervals for coefficients

confint(Kids.model, level = 0.9 )
##                  5 %      95 %
## (Intercept) 4.860349 14.774075
## width       1.107251  2.207997

Adding some predictors

Kids.model2 <- lm( length ~ width + domhand, data = KidsFeet)
msummary(Kids.model2)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.83401    2.98622   3.293  0.00223 ** 
## width        1.65288    0.33629   4.915 1.95e-05 ***
## domhandR     0.03255    0.41892   0.078  0.93849    
## 
## Residual standard error: 1.039 on 36 degrees of freedom
## Multiple R-squared:  0.4111, Adjusted R-squared:  0.3784 
## F-statistic: 12.57 on 2 and 36 DF,  p-value: 7.256e-05
mplot(Kids.model2, which = 1:7, system = "gg")
## [[1]]
## `geom_smooth()` using method = 'loess'

## 
## [[2]]

## 
## [[3]]
## `geom_smooth()` using method = 'loess'

## 
## [[4]]

## 
## [[5]]
## `geom_smooth()` using method = 'loess'

## 
## [[6]]
## `geom_smooth()` using method = 'loess'

## 
## [[7]]

gf_lm(length ~ width, color = ~domhand, data = KidsFeet)

plotModel(Kids.model2, system = "gg")

Adding interaction

Kids.model3 <- lm( length ~ width * domhand, data = KidsFeet)
msummary(Kids.model3)
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     21.1763    11.5455   1.834   0.0751 .
## width            0.3658     1.3095   0.279   0.7816  
## domhandR       -12.1304    11.9672  -1.014   0.3177  
## width:domhandR   1.3779     1.3549   1.017   0.3161  
## 
## Residual standard error: 1.038 on 35 degrees of freedom
## Multiple R-squared:  0.428,  Adjusted R-squared:  0.379 
## F-statistic:  8.73 on 3 and 35 DF,  p-value: 0.0001854
plotModel(Kids.model3, system = "gg")

Take a look in the effects package for a general way to plot complicted models.

Higher order polynomial terms and transformations

Kids.model4 <- lm( log(length) ~ poly(width,2) * domhand, data = KidsFeet)
msummary(Kids.model4)
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.18004    0.03694  86.082   <2e-16 ***
## poly(width, 2)1          -0.02044    0.21315  -0.096    0.924    
## poly(width, 2)2          -0.14225    0.28655  -0.496    0.623    
## domhandR                  0.02617    0.03777   0.693    0.493    
## poly(width, 2)1:domhandR  0.24313    0.21793   1.116    0.273    
## poly(width, 2)2:domhandR  0.14893    0.29018   0.513    0.611    
## 
## Residual standard error: 0.04313 on 33 degrees of freedom
## Multiple R-squared:  0.4336, Adjusted R-squared:  0.3477 
## F-statistic: 5.052 on 5 and 33 DF,  p-value: 0.001515

Generalized linear models

glm(sex ~ length * domhand, data = KidsFeet, family = binomial(link = logit)) %>%
  msummary()
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     -17.2086    18.0157  -0.955   0.3395  
## length            0.6826     0.7352   0.928   0.3532  
## domhandR         39.2148    20.4105   1.921   0.0547 .
## length:domhandR  -1.5661     0.8302  -1.886   0.0592 .
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 54.040  on 38  degrees of freedom
## Residual deviance: 45.155  on 35  degrees of freedom
## AIC: 53.155
## 
## Number of Fisher Scoring iterations: 4
glm(sex ~ length * domhand, data = KidsFeet, family = binomial(link = logit)) %>%
  plotModel(system = "gg")

What about ANOVA

lm( length ~ sex * domhand, data = KidsFeet) %>%
  anova()
## Analysis of Variance Table
## 
## Response: length
##             Df Sum Sq Mean Sq F value  Pr(>F)  
## sex          1  5.988  5.9882  4.0263 0.05257 .
## domhand      1  1.723  1.7227  1.1583 0.28918  
## sex:domhand  1  6.205  6.2049  4.1721 0.04868 *
## Residuals   35 52.054  1.4872                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aov( length ~ sex * domhand, data = KidsFeet) 
## Call:
##    aov(formula = length ~ sex * domhand, data = KidsFeet)
## 
## Terms:
##                      sex  domhand sex:domhand Residuals
## Sum of Squares   5.98815  1.72266     6.20492  52.05350
## Deg. of Freedom        1        1           1        35
## 
## Residual standard error: 1.219526
## Estimated effects may be unbalanced
# lattice version of interaction plot
xyplot(length ~ sex, groups = domhand, type = c("a", "p"), data = KidsFeet)

# ggformula version
gf_point(length ~ sex, color = ~domhand, data = KidsFeet) %>%
  gf_line(length ~ sex, color = ~domhand, data = KidsFeet, 
          stat = "summary", group = ~domhand, fun.y = mean)

A nonlinear model

nls( length ~ a + width^p, data = KidsFeet, start = list(a = 0, p = 1)) %>% 
  summary()
## 
## Formula: length ~ a + width^p
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)    
## a 11.86976    1.82905    6.49 1.37e-07 ***
## p  1.16251    0.06447   18.03  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.025 on 37 degrees of freedom
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 4.768e-08