dplyr presentation

Jo Hardin

June 21 & 22, 2016

Tidy Data

rows (cases/observational units) and
columns (variables).
The key is that every row is a case and every column is a variable.
No exceptions.

Chaining

The pipe syntax (%>%) takes a data frame (or data table) and passes it as the first argument of the function on its right.

x %>% f(y) is the same as f(x, y)
y %>% f(x, ., z) is the same as f(x,y,z)

Building Tidy Data

object_name <- function_name(arguments)
object_name <- data_table %>% function_name(arguments)
object_name <-
data_table %>%
function_name(arguments) %>%
function_name(arguments)
In chaining, the value on the left of %>% becomes the first argument to the function on the right.

5 Main Data Verbs

Data verbs take data tables as input and give data tables as output

select(): subsets variables (and rename() )
filter(): subsets cases (keeps the wanted cases, drops the unwanted ones)
mutate(): transforms the variable (and transmute(): like mutate, but returns only the new variables)
arrange(): reorders the cases
summarize(): computes summary statistics

Other Data Verbs

distinct(): returns the unique values in a table
sample_n(): takes a random sample of rows
head(): grab the first few rows
tail(): grab the last few rows
group_by(): SUCCESSIVE functions are applied to groups
ungroup(): reverse the grouping action
summarise():
- min(), max(), mean(), sum(), sd(), median(), and IQR()
- n(): number of observations in the current group
- n_distinct(): number of unique values
- first(x), last(x), and nth(x, n): (like x[1], x[length(x)], and x[n] )

You should know that I did this:

require(mosaic)
require(babynames)
require(NHANES)
Babynames <- babynames
names(NHANES) <- tolower(names(NHANES))

Finally, some Examples!

Babynames %>% nrow()

## [1] 1825433

Babynames %>% names()

## [1] "year" "sex"  "name" "n"    "prop"

Finally, some Examples!

Babynames %>% glimpse()

## Observations: 1,825,433
## Variables: 5
## $ year (dbl) 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 188...
## $ sex  (chr) "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F...
## $ name (chr) "Mary", "Anna", "Emma", "Elizabeth", "Minnie", "Margaret"...
## $ n    (int) 7065, 2604, 2003, 1939, 1746, 1578, 1472, 1414, 1320, 128...
## $ prop (dbl) 0.072383587, 0.026678961, 0.020521490, 0.019865786, 0.017...

Finally, some Examples!

Babynames %>% head()

## Source: local data frame [6 x 5]
## 
##    year   sex      name     n       prop
##   (dbl) (chr)     (chr) (int)      (dbl)
## 1  1880     F      Mary  7065 0.07238359
## 2  1880     F      Anna  2604 0.02667896
## 3  1880     F      Emma  2003 0.02052149
## 4  1880     F Elizabeth  1939 0.01986579
## 5  1880     F    Minnie  1746 0.01788843
## 6  1880     F  Margaret  1578 0.01616720

Finally, some Examples!

Babynames %>% sample_n(size=5)

## Source: local data frame [5 x 5]
## 
##    year   sex    name     n         prop
##   (dbl) (chr)   (chr) (int)        (dbl)
## 1  2001     F   Deven    32 1.616645e-05
## 2  1919     M    Lois    76 7.485104e-05
## 3  1960     M   Julie    33 1.523510e-05
## 4  1974     F Brookie     7 4.469699e-06
## 5  1997     F   Dejia    22 1.152798e-05

NHANES Data

names(NHANES)

##  [1] "id"               "surveyyr"         "gender"          
##  [4] "age"              "agedecade"        "agemonths"       
##  [7] "race1"            "race3"            "education"       
## [10] "maritalstatus"    "hhincome"         "hhincomemid"     
## [13] "poverty"          "homerooms"        "homeown"         
## [16] "work"             "weight"           "length"          
## [19] "headcirc"         "height"           "bmi"             
## [22] "bmicatunder20yrs" "bmi_who"          "pulse"           
## [25] "bpsysave"         "bpdiaave"         "bpsys1"          
## [28] "bpdia1"           "bpsys2"           "bpdia2"          
## [31] "bpsys3"           "bpdia3"           "testosterone"    
## [34] "directchol"       "totchol"          "urinevol1"       
## [37] "urineflow1"       "urinevol2"        "urineflow2"      
## [40] "diabetes"         "diabetesage"      "healthgen"       
## [43] "daysphyshlthbad"  "daysmenthlthbad"  "littleinterest"  
## [46] "depressed"        "npregnancies"     "nbabies"         
## [49] "age1stbaby"       "sleephrsnight"    "sleeptrouble"    
## [52] "physactive"       "physactivedays"   "tvhrsday"        
## [55] "comphrsday"       "tvhrsdaychild"    "comphrsdaychild" 
## [58] "alcohol12plusyr"  "alcoholday"       "alcoholyear"     
## [61] "smokenow"         "smoke100"         "smoke100n"       
## [64] "smokeage"         "marijuana"        "agefirstmarij"   
## [67] "regularmarij"     "ageregmarij"      "harddrugs"       
## [70] "sexever"          "sexage"           "sexnumpartnlife" 
## [73] "sexnumpartyear"   "samesex"          "sexorientation"  
## [76] "pregnantnow"

1. select(): subsets variables

find the sleep variables

NHANESsleep <- 
  NHANES %>% 
  select(gender, age, weight, race1, race3, education, sleeptrouble,
         sleephrsnight, tvhrsday, tvhrsdaychild, physactive)

names(NHANESsleep)

##  [1] "gender"        "age"           "weight"        "race1"        
##  [5] "race3"         "education"     "sleeptrouble"  "sleephrsnight"
##  [9] "tvhrsday"      "tvhrsdaychild" "physactive"

dim(NHANESsleep)

## [1] 10000    11

2. filter(): subsets cases

subset for college-aged people (ages 18 to 22)

NHANESsleep <- NHANESsleep %>% filter(age %in% c(18:22))
histogram(~age, data=NHANESsleep)

3. mutate(): transforms the variable

mutate or transmute to create a new variable?

NHANESsleep %>% mutate(weightlb = weight*2.2) %>% head(3)

## Source: local data frame [3 x 12]
## 
##   gender   age weight  race1  race3    education sleeptrouble
##   (fctr) (int)  (dbl) (fctr) (fctr)       (fctr)       (fctr)
## 1 female    21  103.5  Black     NA Some College          Yes
## 2 female    21  103.5  Black     NA Some College          Yes
## 3 female    22   81.8  Black     NA Some College           No
## Variables not shown: sleephrsnight (int), tvhrsday (fctr), tvhrsdaychild
##   (int), physactive (fctr), weightlb (dbl)

3. mutate(): transforms the variable

mutate or transmute to create a new variable?

NHANESsleep %>% transmute(weightlb = weight*2.2) %>% head(3)

## Source: local data frame [3 x 1]
## 
##   weightlb
##      (dbl)
## 1   227.70
## 2   227.70
## 3   179.96

5. summarize(): computes summary statistics

# number of people (cases) in NHANES
NHANES %>% summarise(n())

## Source: local data frame [1 x 1]
## 
##     n()
##   (int)
## 1 10000

5. summarize(): computes summary statistics

# total weight of all the people in NHANES (silly)
NHANES %>% 
  mutate(weightlb = weight*2.2) %>% 
  summarise(sum(weightlb, na.rm=TRUE))

## Source: local data frame [1 x 1]
## 
##   sum(weightlb, na.rm = TRUE)
##                         (dbl)
## 1                     1549419

5. summarize(): computes summary statistics

# mean weight of all the people in NHANES
NHANES %>% 
  mutate(weightlb = weight*2.2) %>% 
  summarise(mean(weightlb, na.rm=TRUE))

## Source: local data frame [1 x 1]
## 
##   mean(weightlb, na.rm = TRUE)
##                          (dbl)
## 1                       156.16

5. summarize() with group_by()

# mean weight of all the people in NHANES, broken down by education
NHANES %>% 
  mutate(weightlb = weight*2.2) %>% 
  group_by(education) %>% 
  summarise(mean(weightlb, na.rm=TRUE))

## Source: local data frame [6 x 2]
## 
##        education mean(weightlb, na.rm = TRUE)
##           (fctr)                        (dbl)
## 1      8th Grade                    173.03296
## 2 9 - 11th Grade                    180.53893
## 3    High School                    183.10185
## 4   Some College                    185.20643
## 5   College Grad                    176.77638
## 6             NA                     91.69645

4. arrange(): reorders the cases

# mean weight, by education, sorted
NHANES %>% 
  mutate(weightlb = weight*2.2) %>% 
  group_by(education) %>% 
  summarise(avewt = mean(weightlb, na.rm=TRUE)) %>% 
  arrange(avewt)

## Source: local data frame [6 x 2]
## 
##        education     avewt
##           (fctr)     (dbl)
## 1             NA  91.69645
## 2      8th Grade 173.03296
## 3   College Grad 176.77638
## 4 9 - 11th Grade 180.53893
## 5    High School 183.10185
## 6   Some College 185.20643

Your Turn

When starting, it can be helpful to work with a small subset of the data. When you have your data wrangling statements in working order, shift to the entire data table.

SmallSubset <-
  Babynames %>%
  filter(year > 2000) %>%
  sample_n(size = 200)

names(SmallSubset)

## [1] "year" "sex"  "name" "n"    "prop"

How many babies are represented?

SmallSubset %>%
  summarise(total = ????(n)) # a reduction verb

How many babies are represented?

SmallSubset %>%
  summarise(total = sum(n))

## Source: local data frame [1 x 1]
## 
##   total
##   (int)
## 1 28365

How many babies are there in each year?

SmallSubset %>% 
  group_by(????) %>% 
  summarise(total = ????(n))

How many babies are there in each year?

SmallSubset %>% 
  group_by(year) %>% 
  summarise(total = sum(n))

## Source: local data frame [14 x 2]
## 
##     year total
##    (dbl) (int)
## 1   2001   760
## 2   2002  4093
## 3   2003   818
## 4   2004  3725
## 5   2005  9810
## 6   2006   370
## 7   2007  1950
## 8   2008   229
## 9   2009   869
## 10  2010  1328
## 11  2011   464
## 12  2012  2252
## 13  2013  1463
## 14  2014   234

How many distinct names in each year?

SmallSubset %>%
  group_by(????) %>%
  summarise(name_count = n_distinct(????))

How many distinct names in each year?

SmallSubset %>%
  group_by(year) %>%
  summarise(name_count = n_distinct(name))

## Source: local data frame [14 x 2]
## 
##     year name_count
##    (dbl)      (int)
## 1   2001         20
## 2   2002          7
## 3   2003         11
## 4   2004         15
## 5   2005         11
## 6   2006         18
## 7   2007         14
## 8   2008         18
## 9   2009         12
## 10  2010         19
## 11  2011          9
## 12  2012         14
## 13  2013         20
## 14  2014         12

How many distinct names of each sex in each year?

SmallSubset %>%
  group_by(????, ????) %>%
  summarise(????)

How many distinct names of each sex in each year?

temp <- SmallSubset %>%
  group_by(year, sex) %>%
  summarise(name_count=n_distinct(name))
data.frame(temp)

##    year sex name_count
## 1  2001   F         10
## 2  2001   M         10
## 3  2002   F          2
## 4  2002   M          5
## 5  2003   F          8
## 6  2003   M          3
## 7  2004   F          8
## 8  2004   M          7
## 9  2005   F          4
## 10 2005   M          7
## 11 2006   F         11
## 12 2006   M          7
## 13 2007   F          8
## 14 2007   M          6
## 15 2008   F         11
## 16 2008   M          7
## 17 2009   F         10
## 18 2009   M          2
## 19 2010   F         12
## 20 2010   M          7
## 21 2011   F          4
## 22 2011   M          5
## 23 2012   F          9
## 24 2012   M          5
## 25 2013   F          9
## 26 2013   M         11
## 27 2014   F          6
## 28 2014   M          6

Track the yearly number of Janes and Marys over the years.

Result <-
  Babynames %>%
  ????(name %in% c("Jane", "Mary")) %>% # just the Janes and Marys
  group_by(????, ????) %>% # for each year for each name
  summarise(count = ????)

Track the yearly number of Janes and Marys over the years.

Result <-
  Babynames %>%
  filter(name %in% c("Jane", "Mary")) %>% # just the Janes and Marys
  group_by(name, year) %>% # for each year for each name
  summarise(count = sum(n))

Plot out the result

Put year on the x-axis and the count of each name on the y-axis. Note that ggplot() commands use + rather than %>%.

ggplot(data=Result, aes(x = year, y = count)) +
  geom_point()

Map the name (Mary or Jane) to the aesthetic of color. Remember that mapping to aesthetics is always done inside the aes() function.
Instead of using dots as the glyph, use a line that connects consecutive values: geom_line().
Change the y-axis label to “Yearly Births”: + ylab("Yearly Births")
Set the line thickness to size=2. Remember that “setting” refers to adjusting the value of an aesthetic to a constant. Thus, it’s outside the aes() function.

Plot out the result

ggplot(data=Result, aes(x = year, y = count)) +
  geom_line(aes(color=name), size=2) + 
  ylab("Yearly Births")

Look at the proportion of births rather than the count

Filter first on female, then on Mary and Jane. In between, compute the yearly total of births in the dataset, so that you can calculate the proportion of births in the dataset which are Marys and Janes (as opposed to the proportion of all babies from the given year, which is the prop variable).

Result2 <-
  Babynames %>%
  group_by(year) %>%
  mutate(total = ????(n)) %>%
  filter(????) %>%
  mutate(proportion = n / total)

Look at the proportion of births rather than the count

Just as you did with count vs year, graph proportion vs year.

Result2 %>%
  Your ggplot statements go here!

Add a vertical line to mark a year in which something happened that might relate to the increase or decrease in the popularity of the name. Example: The movie Whatever Happened to Baby Jane came out in 1962. The glyph is a vertical line: geom_vline().

Look at the proportion of births rather than the count

Result2 <-
  Babynames %>%
  filter(sex == "F") %>%
  filter(sex %in% c("F"))  %>%
  group_by(year) %>%
  mutate(total = sum(n)) %>%
  filter(name %in% c("Mary", "Jane")) %>%
  mutate(proportion = n / total)

Look at the proportion of births rather than the count

ggplot(data=Result2, aes(x=year, y=proportion)) +
  geom_line(aes(color=name), size=2) +
  ylab("Yearly Births") +
  geom_vline(xintercept=1962)

Pick out name(s) of interest to you

Plot out their popularity over time.

Pick out name(s) of interest to you

Frances <- babynames %>%
  filter(name== "Frances")

Pick out name(s) of interest to you

Frances %>% ggplot(aes(x=year, y=n)) +
  geom_point(aes(color=sex), size=.5) + 
  geom_line(aes(color=sex)) + 
  geom_vline(xintercept=2006) + scale_y_log10() +
  ylab("Yearly total on log10 scale")

gather / spread (new verbs)

http://garrettgman.github.io/tidying/

-new_sp_m014 - new_rel_f65: Counts of new TB cases recorded by group.

-code for method of diagnosis (rel = relapse, sn = negative pulmonary smear, sp = positive pulmonary smear, ep = extrapulmonary)
-code for gender (f = female, m = male)
-code for age group (014 = 0-14 yrs of age, 1524 = 15-24 years of age, 2534 = 25 to 34 years of age, 3544 = 35 to 44 years of age, 4554 = 45 to 54 years of age, 5564 = 55 to 64 years of age, 65 = 65 years of age or older).

#install.packages(c("tidyr", "devtools"))
#devtools::install_github("garrettgman/DSR")
require(tidyr)
require(DSR)
data(who)
head(who)

## Source: local data frame [6 x 60]
## 
##       country  iso2  iso3  year new_sp_m014 new_sp_m1524 new_sp_m2534
##         <chr> <chr> <chr> <int>       <int>        <int>        <int>
## 1 Afghanistan    AF   AFG  1980          NA           NA           NA
## 2 Afghanistan    AF   AFG  1981          NA           NA           NA
## 3 Afghanistan    AF   AFG  1982          NA           NA           NA
## 4 Afghanistan    AF   AFG  1983          NA           NA           NA
## 5 Afghanistan    AF   AFG  1984          NA           NA           NA
## 6 Afghanistan    AF   AFG  1985          NA           NA           NA
## Variables not shown: new_sp_m3544 <int>, new_sp_m4554 <int>, new_sp_m5564
##   <int>, new_sp_m65 <int>, new_sp_f014 <int>, new_sp_f1524 <int>,
##   new_sp_f2534 <int>, new_sp_f3544 <int>, new_sp_f4554 <int>, new_sp_f5564
##   <int>, new_sp_f65 <int>, new_sn_m014 <int>, new_sn_m1524 <int>,
##   new_sn_m2534 <int>, new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564
##   <int>, new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
##   new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>, new_sn_f5564
##   <int>, new_sn_f65 <int>, new_ep_m014 <int>, new_ep_m1524 <int>,
##   new_ep_m2534 <int>, new_ep_m3544 <int>, new_ep_m4554 <int>, new_ep_m5564
##   <int>, new_ep_m65 <int>, new_ep_f014 <int>, new_ep_f1524 <int>,
##   new_ep_f2534 <int>, new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564
##   <int>, new_ep_f65 <int>, new_rel_m014 <int>, new_rel_m1524 <int>,
##   new_rel_m2534 <int>, new_rel_m3544 <int>, new_rel_m4554 <int>,
##   new_rel_m5564 <int>, new_rel_m65 <int>, new_rel_f014 <int>,
##   new_rel_f1524 <int>, new_rel_f2534 <int>, new_rel_f3544 <int>,
##   new_rel_f4554 <int>, new_rel_f5564 <int>, new_rel_f65 <int>.

gather

whoTidy <- who %>% gather("code", "value", 5:60)
whoTidy %>% sample_n(10)

## Source: local data frame [10 x 6]
## 
##         country  iso2  iso3  year          code value
##           <chr> <chr> <chr> <int>         <chr> <int>
## 1    Seychelles    SC   SYC  1982  new_sn_m2534    NA
## 2   Switzerland    CH   CHE  2006  new_sn_f5564     6
## 3       Tunisia    TN   TUN  1996   new_ep_f014    NA
## 4    Mozambique    MZ   MOZ  2005 new_rel_m1524    NA
## 5       Germany    DE   DEU  2007  new_sn_m1524   106
## 6          Oman    OM   OMN  1989   new_ep_f014    NA
## 7       Grenada    GD   GRD  1985  new_ep_f4554    NA
## 8  Burkina Faso    BF   BFA  2010  new_ep_f3544    NA
## 9        Zambia    ZM   ZMB  1996 new_rel_m1524    NA
## 10       Turkey    TR   TUR  1980  new_sn_m1524    NA

separate

whoTidy <- whoTidy %>% separate(code, c("new", "var", "sexage"))
whoTidy %>% sample_n(10)

## Source: local data frame [10 x 8]
## 
##                        country  iso2  iso3  year   new   var sexage value
##                          <chr> <chr> <chr> <int> <chr> <chr>  <chr> <int>
## 1            Brunei Darussalam    BN   BRN  2012   new    sn  f2534     3
## 2                      Austria    AT   AUT  1983   new    sn  f4554    NA
## 3                      Uruguay    UY   URY  1991   new    sn  f3544    NA
## 4                        Nepal    NP   NPL  2006   new    sp  f5564   519
## 5                  El Salvador    SV   SLV  1980   new   rel  f3544    NA
## 6                       France    FR   FRA  1983   new    sn  m5564    NA
## 7                      Armenia    AM   ARM  2010   new    ep  f4554    28
## 8             China, Macao SAR    MO   MAC  1990   new    sn  m1524    NA
## 9                        Italy    IT   ITA  1992   new    sp  m3544    NA
## 10 United Republic of Tanzania    TZ   TZA  2001   new    sn  f3544    NA

separate

whoTidy <- whoTidy %>% separate(sexage, c("sex", "age"), sep = 1)
whoTidy %>% sample_n(10)

## Source: local data frame [10 x 9]
## 
##                   country  iso2  iso3  year   new   var   sex   age value
##                     <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <int>
## 1                    Chad    TD   TCD  1984   new    sp     m  2534    NA
## 2                 Georgia    GE   GEO  2001   new   rel     m  2534    NA
## 3                 Grenada    GD   GRD  1981   new   rel     f   014    NA
## 4              Montserrat    MS   MSR  1995   new    ep     f  5564    NA
## 5                   Benin    BJ   BEN  1999   new    sp     f    65    30
## 6              Azerbaijan    AZ   AZE  1982   new    sp     f  2534    NA
## 7                Zimbabwe    ZW   ZWE  1988   new   rel     m    65    NA
## 8                   Malta    MT   MLT  2006   new    sn     f  4554     0
## 9                   Libya    LY   LBY  2006   new    sn     m  2534    NA
## 10 Bosnia and Herzegovina    BA   BIH  1996   new    sn     f   014    NA

spread

whoTidy <- whoTidy %>% spread(var, value)
whoTidy %>% sample_n(10)

## Source: local data frame [10 x 11]
## 
##                             country  iso2  iso3  year   new   sex   age
##                               <chr> <chr> <chr> <int> <chr> <chr> <chr>
## 1                             Qatar    QA   QAT  2010   new     m  1524
## 2                           Belarus    BY   BLR  1994   new     m  2534
## 3              Netherlands Antilles    AN   ANT  1993   new     f  3544
## 4                        Seychelles    SC   SYC  1993   new     m  5564
## 5                      Saudi Arabia    SA   SAU  1998   new     m  3544
## 6                            Zambia    ZM   ZMB  1997   new     f  2534
## 7                     Guinea-Bissau    GW   GNB  1986   new     f  2534
## 8                              Fiji    FJ   FJI  1994   new     f  2534
## 9  Micronesia (Federated States of)    FM   FSM  1999   new     m  5564
## 10                       Kyrgyzstan    KG   KGZ  1980   new     m  2534
## Variables not shown: ep <int>, rel <int>, sn <int>, sp <int>.

join (flights)

require(nycflights13)
names(flights)

##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"

Flights <- flights
Airports <- airports
Airlines <- airlines
Weather <- weather
Planes <- planes

join (flights)

names(Flights)

##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"

names(Airports)

## [1] "faa"  "name" "lat"  "lon"  "alt"  "tz"   "dst"

names(Airlines)

## [1] "carrier" "name"

names(Weather)

##  [1] "origin"     "year"       "month"      "day"        "hour"      
##  [6] "temp"       "dewp"       "humid"      "wind_dir"   "wind_speed"
## [11] "wind_gust"  "precip"     "pressure"   "visib"      "time_hour"

names(Planes)

## [1] "tailnum"      "year"         "type"         "manufacturer"
## [5] "model"        "engines"      "seats"        "speed"       
## [9] "engine"

join (flights)

Flights %>% select(carrier, flight, tailnum, origin, dest) %>% head()

## Source: local data frame [6 x 5]
## 
##   carrier flight tailnum origin  dest
##     <chr>  <int>   <chr>  <chr> <chr>
## 1      UA   1545  N14228    EWR   IAH
## 2      UA   1714  N24211    LGA   IAH
## 3      AA   1141  N619AA    JFK   MIA
## 4      B6    725  N804JB    JFK   BQN
## 5      DL    461  N668DN    LGA   ATL
## 6      UA   1696  N39463    EWR   ORD

Airports %>% select(faa, name, lat, lon) %>% head()

## Source: local data frame [6 x 4]
## 
##     faa                           name      lat       lon
##   <chr>                          <chr>    <dbl>     <dbl>
## 1   04G              Lansdowne Airport 41.13047 -80.61958
## 2   06A  Moton Field Municipal Airport 32.46057 -85.68003
## 3   06C            Schaumburg Regional 41.98934 -88.10124
## 4   06N                Randall Airport 41.43191 -74.39156
## 5   09J          Jekyll Island Airport 31.07447 -81.42778
## 6   0A9 Elizabethton Municipal Airport 36.37122 -82.17342

Flights %>% 
  inner_join(Airports, by=c("dest" = "faa")) %>%
  select(carrier, flight, tailnum, origin, dest, name, lat, lon) %>%
  head()

## Source: local data frame [6 x 8]
## 
##   carrier flight tailnum origin  dest                            name
##     <chr>  <int>   <chr>  <chr> <chr>                           <chr>
## 1      UA   1545  N14228    EWR   IAH    George Bush Intercontinental
## 2      UA   1714  N24211    LGA   IAH    George Bush Intercontinental
## 3      AA   1141  N619AA    JFK   MIA                      Miami Intl
## 4      DL    461  N668DN    LGA   ATL Hartsfield Jackson Atlanta Intl
## 5      UA   1696  N39463    EWR   ORD              Chicago Ohare Intl
## 6      B6    507  N516JB    EWR   FLL  Fort Lauderdale Hollywood Intl
## Variables not shown: lat <dbl>, lon <dbl>.

join (flights)

# inner_join: return all rows from x where there are matching values in y, and all columns from x and y. 
Flights %>% inner_join(Airports, by=c("dest" = "faa")) %>% dim()

## [1] 329174     25

# left_join: return all rows from x, and all columns from x and y.
Flights %>% left_join(Airports, by=c("dest" = "faa")) %>% dim()

## [1] 336776     25

# right_join: return all rows from y, and all columns from x and y. 
Flights %>% right_join(Airports, by=c("dest" = "faa")) %>% dim()

## [1] 330469     25

# anti_join: return all rows from x where there are not matching values in y, keeping just columns from x.
Flights %>% anti_join(Airports, by=c("dest" = "faa")) %>% dim()

## [1] 7602   19

# full_join: return all rows and all columns from both x and y.
Flights %>% full_join(Airports, by=c("dest" = "faa")) %>% dim()

## [1] 338071     25

lubridate

require(lubridate)

rightnow <- now()
rightnow

## [1] "2016-06-22 15:32:10 PDT"

day(rightnow)

## [1] 22

week(rightnow)

## [1] 25

month(rightnow, label=FALSE)

## [1] 6

month(rightnow, label=TRUE)

## [1] Jun
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec

year(rightnow)

## [1] 2016

minute(rightnow)

## [1] 32

hour(rightnow)

## [1] 15

yday(rightnow)

## [1] 174

mday(rightnow)

## [1] 22

wday(rightnow, label=FALSE)

## [1] 4

wday(rightnow, label=TRUE)

## [1] Wed
## Levels: Sun < Mon < Tues < Wed < Thurs < Fri < Sat

lubridate

math with lubridate

jan31 <- ymd("2013-01-31")
jan31 + months(0:11)

##  [1] "2013-01-31" NA           "2013-03-31" NA           "2013-05-31"
##  [6] NA           "2013-07-31" "2013-08-31" NA           "2013-10-31"
## [11] NA           "2013-12-31"

floor_date(jan31, "month") # round down to the nearest month

## [1] "2013-01-01"

floor_date(jan31, "month") + months(0:11) + days(31)

##  [1] "2013-02-01" "2013-03-04" "2013-04-01" "2013-05-02" "2013-06-01"
##  [6] "2013-07-02" "2013-08-01" "2013-09-01" "2013-10-02" "2013-11-01"
## [11] "2013-12-02" "2014-01-01"

jan31 + months(0:11) + days(31)

##  [1] "2013-03-03" NA           "2013-05-01" NA           "2013-07-01"
##  [6] NA           "2013-08-31" "2013-10-01" NA           "2013-12-01"
## [11] NA           "2014-01-31"

#Add and subtract months to a date without exceeding the last day of the new month 
jan31 %m+% months(0:11)

##  [1] "2013-01-31" "2013-02-28" "2013-03-31" "2013-04-30" "2013-05-31"
##  [6] "2013-06-30" "2013-07-31" "2013-08-31" "2013-09-30" "2013-10-31"
## [11] "2013-11-30" "2013-12-31"

extract_numeric

require(tidyr)
extract_numeric("$1,200.34")

## [1] 1200.34

extract_numeric("-2%")

## [1] -2

# The heuristic is not perfect - it won't fail for things that
# clearly aren't numbers
extract_numeric("-2-2")

## [1] NA

extract_numeric("12abc34")

## [1] 1234

swirl

For a list of swirl courses, see: https://github.com/swirldev/swirl_courses

install.packages("swirl")
require(swirl) 
install_from_swirl("Getting and Cleaning Data")
swirl()