require(babynames)
require(mosaic)
Babynames = babynames
Each year, the US Social Security Administration publishes a list of the most popular names given to babies. In 2014, the list shows Emma and Olivia leading for girls, Noah and Liam for boys.
The babynames
data table in the babynames
package comes from the Social Security Administration’s listing of the names givens to babies in each year, and the number of babies of each sex given that name. (Only names with 5 or more babies are published by the SSA.)
A few simple questions about the data.
When starting, it can be helpful to work with a small subset of the data. When you have your data wrangling statements in working order, shift to the entire data table.
SmallSubset <-
Babynames %>%
filter(year > 2000) %>%
sample_n(size = 200)
Note: Chunks in this template are headed with {r eval=FALSE}
. Change this to {r}
when you are ready to compile
SmallSubset %>%
summarise(total = ????(n)) # a reduction verb
SmallSubset %>%
group_by(????) %>%
summarise(total = ????(n))
SmallSubset %>%
group_by(????) %>%
summarise(name_count = n_distinct(????))
SmallSubset %>%
group_by(????, ????) %>%
summarise(????)
Result <-
Babynames %>%
????(name %in% c("Jane", "Mary")) %>% # just the Janes and Marys
group_by(????, ????) %>% # for each year for each name
summarise(count = ????)
Put year
on the x-axis and the count of each name on the y-axis. Note that ggplot()
commands use +
rather than %>%
.
ggplot(data=Result, aes(x = year, y = count)) +
geom_point()
aes()
function.geom_line()
.+ ylab("Yearly Births")
size=2
. Remember that “setting” refers to adjusting the value of an aesthetic to a constant. Thus, it’s outside the aes()
function.Result2 <-
Babynames %>%
group_by(year) %>%
mutate(total = ????(n)) %>%
filter(????) %>%
mutate(proportion = n / total)
sex
a variable in Result2
? Eliminate it, keeping just the girls. Note: It would likely be better to add up the boys and girls, but this is surprisingly hard. It becomes much easier once you have another data verb to work with: inner_join()
.filter()
step is put before the mutate()
step?Just as you did with count vs year, graph proportion vs year.
Result2 %>%
Your ggplot statements go here!
geom_vline()
.Plot out their popularity over time.
require(googlesheets)
require(tidyr)
gather the military paygrade data: https://docs.google.com/spreadsheets/d/1Ow6Cm4z-Z1Yybk3i352msulYCEDOUaOghmo9ALajyHo/edit#gid=1811988794
navy.url = "https://docs.google.com/spreadsheets/d/1Ow6Cm4z-Z1Yybk3i352msulYCEDOUaOghmo9ALajyHo/edit#gid=1877566408"
navy.temp = gs_url(navy.url, visibility="public")
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 1Ow6Cm4z-Z1Yybk3i352msulYCEDOUaOghmo9ALajyHo
## Worksheets feed constructed with public visibility
Navy = gs_read(navy.temp)
## Accessing worksheet titled 'Navy'.
## No encoding supplied: defaulting to UTF-8.
glimpse(Navy)
## Observations: 37
## Variables: 17
## $ X1 (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ Active Duty Family (chr) NA, "Marital Status Report", NA, "Data Refl...
## $ X3 (chr) NA, NA, NA, NA, NA, NA, "Single Without Chi...
## $ X4 (chr) NA, NA, NA, NA, NA, NA, NA, "Female", "2,27...
## $ X5 (chr) NA, NA, NA, NA, NA, NA, NA, "Total", "10,09...
## $ X6 (chr) NA, NA, NA, NA, NA, NA, "Single With Childr...
## $ X7 (chr) NA, NA, NA, NA, NA, NA, NA, "Female", "34",...
## $ X8 (chr) NA, NA, NA, NA, NA, NA, NA, "Total", "151",...
## $ X9 (chr) NA, NA, NA, NA, NA, NA, "Joint Service Marr...
## $ X10 (chr) NA, NA, NA, NA, NA, NA, NA, "Female", "57",...
## $ X11 (chr) NA, NA, NA, NA, NA, NA, NA, "Total", "87", ...
## $ X12 (chr) NA, NA, NA, NA, NA, NA, "Civilian Marriage"...
## $ X13 (chr) NA, NA, NA, NA, NA, NA, NA, "Female", "162"...
## $ X14 (chr) NA, NA, NA, NA, NA, NA, NA, "Total", "968",...
## $ X15 (chr) NA, NA, NA, NA, NA, NA, "Total", "Male", "8...
## $ X16 (chr) NA, NA, NA, NA, NA, NA, NA, "Female", "2,52...
## $ X17 (chr) NA, NA, NA, NA, NA, NA, NA, "Total", "11,30...
names(Navy) = c("X","pay.grade", "male.sing.wo", "female.sing.wo", "tot.sing.wo",
"male.sing.w", "female.sing.w", "tot.sing.w", "male.joint.NA",
"female.joint.NA", "tot.joint.NA", "male.civ.NA", "female.civ.NA",
"tot.civ.NA", "male.tot.NA", "female.tot.NA", "tot.tot.NA")
Navy = Navy[-c(1:8), -1]
# get rid of total columns & rows:
NavyWR <-
Navy %>%
select(-contains("tot")) %>%
filter(substr(pay.grade, 1, 5) != "TOTAL" & substr(pay.grade, 1, 5) != "GRAND" ) %>%
gather(status,numPeople,-pay.grade) %>%
separate(status, into=c("sex", "marital", "kids", sep=".")) %>%
select(c(1:4,6)) %>% mutate(count=extract_numeric(numPeople))
## Warning: Too few values at 184 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
NavyWR %>% head()
## Source: local data frame [6 x 6]
##
## pay.grade sex marital kids numPeople count
## (chr) (chr) (chr) (chr) (chr) (dbl)
## 1 E-1 male sing wo 7,820 7820
## 2 E-2 male sing wo 11,198 11198
## 3 E-3 male sing wo 28,163 28163
## 4 E-4 male sing wo 23,285 23285
## 5 E-5 male sing wo 18,856 18856
## 6 E-6 male sing wo 5,917 5917
Does a graph tell us if we did it right? what if we had done it wrong…?
NavyWR %>% ggplot(aes(x=pay.grade, y=count, color=sex)) +
geom_point() +
facet_grid(kids~marital)
The suite of join functions in dplyr allows for combining two different data tables by matching one or more variables. A great cheatsheet on joining: http://stat545.com/bit001_dplyr-cheatsheet.html
Using nycflights13: Airline on-time data for all flights departing NYC in 2013. Also includes useful ‘metadata’ on airlines, airports, weather, and planes.
require(nycflights13)
names(flights)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour"
Flights <- flights
Airports <- airports
Airlines <- airlines
Weather <- weather
Planes <- planes
names(Flights)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour"
names(Airports)
## [1] "faa" "name" "lat" "lon" "alt" "tz" "dst"
names(Airlines)
## [1] "carrier" "name"
names(Weather)
## [1] "origin" "year" "month" "day" "hour"
## [6] "temp" "dewp" "humid" "wind_dir" "wind_speed"
## [11] "wind_gust" "precip" "pressure" "visib" "time_hour"
names(Planes)
## [1] "tailnum" "year" "type" "manufacturer"
## [5] "model" "engines" "seats" "speed"
## [9] "engine"
Flights %>% select(carrier, flight, tailnum, origin, dest) %>% head()
## Source: local data frame [6 x 5]
##
## carrier flight tailnum origin dest
## (chr) (int) (chr) (chr) (chr)
## 1 UA 1545 N14228 EWR IAH
## 2 UA 1714 N24211 LGA IAH
## 3 AA 1141 N619AA JFK MIA
## 4 B6 725 N804JB JFK BQN
## 5 DL 461 N668DN LGA ATL
## 6 UA 1696 N39463 EWR ORD
Airports %>% select(faa, name, lat, lon) %>% head()
## Source: local data frame [6 x 4]
##
## faa name lat lon
## (chr) (chr) (dbl) (dbl)
## 1 04G Lansdowne Airport 41.13047 -80.61958
## 2 06A Moton Field Municipal Airport 32.46057 -85.68003
## 3 06C Schaumburg Regional 41.98934 -88.10124
## 4 06N Randall Airport 41.43191 -74.39156
## 5 09J Jekyll Island Airport 31.07447 -81.42778
## 6 0A9 Elizabethton Municipal Airport 36.37122 -82.17342
Airports %>% select(faa, name, lat, lon) %>% sample_n(5)
## Source: local data frame [5 x 4]
##
## faa name lat lon
## (chr) (chr) (dbl) (dbl)
## 1 IDA Idaho Falls Rgnl 43.51456 -112.0708
## 2 EUG Mahlon Sweet Fld 44.12458 -123.2120
## 3 EAU Chippewa Valley Regional Airport 44.86580 -91.4843
## 4 CKB Harrison Marion Regional Airport 39.29660 -80.2281
## 5 NGZ NAS Alameda 37.78610 -122.3186
Flights %>%
inner_join(Airports, by=c("dest" = "faa")) %>%
select(carrier, flight, tailnum, origin, dest, name, lat, lon) %>%
head()
## Source: local data frame [6 x 8]
##
## carrier flight tailnum origin dest name
## (chr) (int) (chr) (chr) (chr) (chr)
## 1 UA 1545 N14228 EWR IAH George Bush Intercontinental
## 2 UA 1714 N24211 LGA IAH George Bush Intercontinental
## 3 AA 1141 N619AA JFK MIA Miami Intl
## 4 DL 461 N668DN LGA ATL Hartsfield Jackson Atlanta Intl
## 5 UA 1696 N39463 EWR ORD Chicago Ohare Intl
## 6 B6 507 N516JB EWR FLL Fort Lauderdale Hollywood Intl
## Variables not shown: lat (dbl), lon (dbl)
Flights %>% inner_join(Airports, by=c("dest" = "faa")) %>% dim()
## [1] 329174 25
Flights %>% left_join(Airports, by=c("dest" = "faa")) %>% dim()
## [1] 336776 25
Flights %>% right_join(Airports, by=c("dest" = "faa")) %>% dim()
## [1] 330469 25
Flights %>% anti_join(Airports, by=c("dest" = "faa")) %>% dim()
## [1] 7602 19
Fun with dates!
require(lubridate)
rightnow <- now()
day(rightnow)
## [1] 17
week(rightnow)
## [1] 25
month(rightnow, label=FALSE)
## [1] 6
month(rightnow, label=TRUE)
## [1] Jun
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
year(rightnow)
## [1] 2016
minute(rightnow)
## [1] 55
hour(rightnow)
## [1] 10
yday(rightnow)
## [1] 169
mday(rightnow)
## [1] 17
wday(rightnow, label=FALSE)
## [1] 6
wday(rightnow, label=TRUE)
## [1] Fri
## Levels: Sun < Mon < Tues < Wed < Thurs < Fri < Sat
But how do I create a date object?
jan31 <- ymd("2013-01-31")
jan31 + months(0:11)
## [1] "2013-01-31" NA "2013-03-31" NA "2013-05-31"
## [6] NA "2013-07-31" "2013-08-31" NA "2013-10-31"
## [11] NA "2013-12-31"
floor_date(jan31, "month") # round down to the nearest month
## [1] "2013-01-01"
floor_date(jan31, "month") + months(0:11) + days(31)
## [1] "2013-02-01" "2013-03-04" "2013-04-01" "2013-05-02" "2013-06-01"
## [6] "2013-07-02" "2013-08-01" "2013-09-01" "2013-10-02" "2013-11-01"
## [11] "2013-12-02" "2014-01-01"
jan31 + months(0:11) + days(31)
## [1] "2013-03-03" NA "2013-05-01" NA "2013-07-01"
## [6] NA "2013-08-31" "2013-10-01" NA "2013-12-01"
## [11] NA "2014-01-31"
jan31 %m+% months(0:11)
## [1] "2013-01-31" "2013-02-28" "2013-03-31" "2013-04-30" "2013-05-31"
## [6] "2013-06-30" "2013-07-31" "2013-08-31" "2013-09-30" "2013-10-31"
## [11] "2013-11-30" "2013-12-31"
FlightsWK <-
Flights %>%
mutate(ymdday = ymd(paste(year,"-", month,"-",day))) %>%
mutate(weekdy = wday(ymdday, label=TRUE), whichweek = week(ymdday))
FlightsWK %>%
select(year, month, day, ymdday, weekdy, whichweek, dep_time,
arr_time, air_time) %>%
head()
## Source: local data frame [6 x 9]
##
## year month day ymdday weekdy whichweek dep_time arr_time air_time
## (int) (int) (int) (date) (fctr) (dbl) (int) (int) (dbl)
## 1 2013 1 1 2013-01-01 Tues 1 517 830 227
## 2 2013 1 1 2013-01-01 Tues 1 533 850 227
## 3 2013 1 1 2013-01-01 Tues 1 542 923 160
## 4 2013 1 1 2013-01-01 Tues 1 544 1004 183
## 5 2013 1 1 2013-01-01 Tues 1 554 812 116
## 6 2013 1 1 2013-01-01 Tues 1 554 740 150
(Many of these problems were taken from Modern Data Science with R by Baumer, Kaplan, and Horton, in preparation.)
What plane (specified by the tailnum variable) traveled the most times from New York City airports in 2013? Plot the number of trips per week over the year (for that plane).