Miles Ott (with additions by Nicholas Horton)
CVC 2017
Keeping your data clean and tidy is an important step of every data project
Goal: learn how to take the data set you have and tidy it up to be the data set you want for your analyses
columns (variables).
The key is that every row is a case and every column is a variable.
No exceptions.
Wickham Tidy Data paper http://vita.had.co.nz/papers/tidy-data.html
The pipe syntax (%>%) takes a data frame (or data table) and sends it to the argument of a function.
x %>% f(y) is the same as f(x, y)
y %>% f(x, ., z) is the same as f(x,y,z)
object_name <-
data_table %>%
function_name(arguments) %>%
function_name(arguments)
in chaining, the value (on left) %>% is first argument to the function (on right)
Data verbs take data tables as input and give data tables as output
summarise(): computes summary statistics
filter(): subsets unwanted cases (rows)
arrange(): reorders the cases (rows)
select(): subsets variables (plus rename()
) (columns)
mutate(): transforms the variable (and transmute() like mutate, returns only new variables) (columns)
mosaic
plain template R Markdown file (File/New File/R Markdown/Template)ggplot2movies
and babynames
packages (the template already loads mosaic
)require(mosaic)
require(ggformula)
require(ggplot2movies) #gets us movies dataset
require(babynames) #gets us baby names data set
movies %>% nrow()
[1] 58788
movies %>% names()
[1] "title" "year" "length" "budget" "rating"
[6] "votes" "r1" "r2" "r3" "r4"
[11] "r5" "r6" "r7" "r8" "r9"
[16] "r10" "mpaa" "Action" "Animation" "Comedy"
[21] "Drama" "Documentary" "Romance" "Short"
movies %>% select(title, budget, length)
# A tibble: 58,788 x 3
title budget length
<chr> <int> <int>
1 $ NA 121
2 $1000 a Touchdown NA 71
3 $21 a Day Once a Month NA 7
4 $40,000 NA 70
5 $50,000 Climax Show, The NA 71
6 $pent NA 91
7 $windle NA 93
8 '15' NA 25
9 '38 NA 97
10 '49-'17 NA 61
# ... with 58,778 more rows
slim_movies <- movies %>% select(title, budget, length)
slim_movies %>% names()
[1] "title" "budget" "length"
slim_movies <- movies %>% select(., title, budget, length)
slim_movies %>% head()
# A tibble: 6 x 3
title budget length
<chr> <int> <int>
1 $ NA 121
2 $1000 a Touchdown NA 71
3 $21 a Day Once a Month NA 7
4 $40,000 NA 70
5 $50,000 Climax Show, The NA 71
6 $pent NA 91
slim_movies %>% tail()
# A tibble: 6 x 3
title budget length
<chr> <int> <int>
1 sIDney NA 15
2 tom thumb NA 98
3 www.XXX.com NA 105
4 www.hellssoapopera.com NA 100
5 xXx 85000000 132
6 xXx: State of the Union 87000000 101
slim_movies %>% sample_n(size = 6)
# A tibble: 6 x 3
title budget length
<chr> <int> <int>
1 Eroe dei nostri tempi, Un NA 85
2 Vsetko co mam rad NA 99
3 Gang Busters NA 74
4 Whistle Down the Wind NA 99
5 White Gold NA 91
6 Spy Kids 35000000 88
slim_movies %>% sample_frac(.00007)
# A tibble: 4 x 3
title budget length
<chr> <int> <int>
1 3 Ninjas Kick Back NA 93
2 Guns of Darkness NA 102
3 Three Wise Fools NA 90
4 Brother Tied NA 109
Back to slim_movies data
slim_movies <- slim_movies %>% filter(!is.na(budget), nchar(title) < 24)
mutate or transmute to create a new variable?
slim_movies %>% mutate(dpm = budget/length) %>% head(6)
# A tibble: 6 x 4
title budget length dpm
<chr> <int> <int> <dbl>
1 'G' Men 450000 85 5294.118
2 'Til There Was You 23000000 113 203539.823
3 .com for Murder 5000000 96 52083.333
4 100 Mile Rule 1100000 98 11224.490
5 100 Proof 140000 94 1489.362
6 101 200000 117 1709.402
mutate or transmute to create a new variable?
slim_movies %>% transmute(dpm = budget/length) %>% head(6)
# A tibble: 6 x 1
dpm
<dbl>
1 5294.118
2 203539.823
3 52083.333
4 11224.490
5 1489.362
6 1709.402
# number of movies (cases) in movie data
movies %>% summarise(n())
# A tibble: 1 x 1
`n()`
<int>
1 58788
# total number of minutes of all the movies
movies %>% summarise(sum(length, na.rm = TRUE))
# A tibble: 1 x 1
`sum(length, na.rm = TRUE)`
<int>
1 4840479
# mean of minutes of all the movies
movies %>% summarise(mean(length, na.rm = TRUE))
# A tibble: 1 x 1
`mean(length, na.rm = TRUE)`
<dbl>
1 82.33788
# mean dollars per minute for movies with budget data
movies %>%
mutate(dpm = budget / length) %>%
summarise(mean(dpm, na.rm = TRUE))
# A tibble: 1 x 1
`mean(dpm, na.rm = TRUE)`
<dbl>
1 126931.7
# mean length of movies in hours for all the movies, broken down by mpaa
movies %>%
mutate(hours = length / 60) %>%
group_by(mpaa) %>%
summarise(mean(hours, na.rm = TRUE))
# A tibble: 5 x 2
mpaa `mean(hours, na.rm = TRUE)`
<chr> <dbl>
1 1.344045
2 NC-17 1.836458
3 PG 1.623043
4 PG-13 1.749501
5 R 1.669500
# average length in hours, by mpaa rating, sorted by average length
movies %>%
mutate(hours = length / 60) %>%
group_by(mpaa) %>%
summarise(avelength = mean(hours, na.rm = TRUE)) %>%
arrange(avelength)
# A tibble: 5 x 2
mpaa avelength
<chr> <dbl>
1 1.344045
2 PG 1.623043
3 R 1.669500
4 PG-13 1.749501
5 NC-17 1.836458
When starting, it can be helpful to work with a small subset of the data. When you have your data wrangling statements in working order, shift to the entire data table.
SmallSubset <-
movies %>%
filter(year > 2000) %>%
filter(mpaa != "")
SmallSubset %>%
summarise(avg = ????(rating))
SmallSubset %>%
summarise(avg = mean(rating))
# A tibble: 1 x 1
avg
<dbl>
1 5.740306
SmallSubset %>%
group_by(????) %>%
summarise(avg = ????(rating))
SmallSubset %>%
group_by(mpaa) %>%
summarise(avg = mean(rating))
# A tibble: 4 x 2
mpaa avg
<chr> <dbl>
1 NC-17 4.750000
2 PG 5.892593
3 PG-13 5.903524
4 R 5.655228
SmallSubset %>%
group_by(????) %>%
summarise(Actioncount = sum(????))
SmallSubset %>%
group_by(year) %>%
summarise(Actioncount = sum(Action))
# A tibble: 5 x 2
year Actioncount
<int> <int>
1 2001 94
2 2002 95
3 2003 82
4 2004 60
5 2005 18
SmallSubset %>%
group_by(????, ????) %>%
summarise(????)
SmallSubset %>%
group_by(year, mpaa) %>%
summarise(ComedyCount = sum(Comedy))
# A tibble: 17 x 3
# Groups: year [?]
year mpaa ComedyCount
<int> <chr> <int>
1 2001 NC-17 0
2 2001 PG 27
3 2001 PG-13 53
4 2001 R 107
5 2002 PG 22
6 2002 PG-13 50
7 2002 R 95
8 2003 PG 27
9 2003 PG-13 56
10 2003 R 67
11 2004 PG 25
12 2004 PG-13 46
13 2004 R 59
14 2005 NC-17 0
15 2005 PG 9
16 2005 PG-13 14
17 2005 R 3
RMovies <-
movies %>%
filter(mpaa == "R") %>% # just the rated R movies
group_by(????) %>% # for each year for each movie title
summarise(mean_user_rating = ????)
RMovies <-
movies %>%
filter(mpaa == "R") %>% # just the rated R movies
group_by(year) %>% # for each year for each movie title
summarise(mean_user_rating = mean(rating))
Put year
on the x-axis and the mean_user_rating
for each R rated movie on the y-axis
gf_line(???? ~ ????, data = RMovies)
Put year
on the x-axis and the mean_user_rating
for each R rated movie on the y-axis
gf_line(year ~ mean_user_rating, data = RMovies)
Data verbs take data tables as input and give data tables as output
babynames %>% names()
[1] "year" "sex" "name" "n" "prop"
babynames %>% head()
# A tibble: 6 x 5
year sex name n prop
<dbl> <chr> <chr> <int> <dbl>
1 1880 F Mary 7065 0.07238433
2 1880 F Anna 2604 0.02667923
3 1880 F Emma 2003 0.02052170
4 1880 F Elizabeth 1939 0.01986599
5 1880 F Minnie 1746 0.01788861
6 1880 F Margaret 1578 0.01616737
babynames %>% glimpse()
Observations: 1,858,689
Variables: 5
$ year <dbl> 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 188...
$ sex <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F...
$ name <chr> "Mary", "Anna", "Emma", "Elizabeth", "Minnie", "Margaret"...
$ n <int> 7065, 2604, 2003, 1939, 1746, 1578, 1472, 1414, 1320, 128...
$ prop <dbl> 0.072384329, 0.026679234, 0.020521700, 0.019865989, 0.017...
babynames %>% inspect()
categorical variables:
name class levels n missing
1 sex character 2 1858689 0
2 name character 95025 1858689 0
distribution
1 F (59.2%), M (40.8%)
2 Francis (0%), James (0%) ...
quantitative variables:
name class min Q1 median Q3
1 year numeric 1.880000e+03 1.950000e+03 1.983000e+03 2.002000e+03
2 n integer 5.000000e+00 7.000000e+00 1.200000e+01 3.200000e+01
3 prop numeric 2.259872e-06 3.900959e-06 7.348183e-06 2.324258e-05
max mean sd n missing
1 2.01500e+03 1.973376e+03 33.6978783 1858689 0
2 9.96800e+04 1.833830e+02 1555.3570871 1858689 0
3 8.15463e-02 1.391443e-04 0.0011702 1858689 0
Note: prop
differs from what you think it might be.
Again let's work with a small subset of the data.
set.seed(1999)
smallbaby <-
babynames %>%
filter(year > 2000) %>%
sample_n(size = 300)
names(smallbaby)
[1] "year" "sex" "name" "n" "prop"
smallbaby %>%
summarise(total = ????(n)) # a reduction verb
smallbaby %>%
summarise(total = sum(n))
# A tibble: 1 x 1
total
<int>
1 19580
smallbaby %>%
group_by(????) %>%
summarise(total = ????(n))
smallbaby %>%
group_by(year) %>%
summarise(total = sum(n))
# A tibble: 15 x 2
year total
<dbl> <int>
1 2001 671
2 2002 1401
3 2003 237
4 2004 431
5 2005 669
6 2006 314
7 2007 1422
8 2008 1352
9 2009 1901
10 2010 541
11 2011 6733
12 2012 1627
13 2013 438
14 2014 1150
15 2015 693
smallbaby %>%
group_by(????) %>%
summarise(name_count = n_distinct(????))
smallbaby %>%
group_by(year) %>%
summarise(name_count = n_distinct(name), num = n()) # equivalent to n() here
# A tibble: 15 x 3
year name_count num
<dbl> <int> <int>
1 2001 10 10
2 2002 13 13
3 2003 10 10
4 2004 16 16
5 2005 12 12
6 2006 19 19
7 2007 30 30
8 2008 25 25
9 2009 28 28
10 2010 17 17
11 2011 23 23
12 2012 26 26
13 2013 23 23
14 2014 21 21
15 2015 27 27
smallbaby %>%
group_by(????, ????) %>%
summarise(????)
temp <- smallbaby %>%
group_by(sex) %>%
summarise(name_count = n_distinct(name), num = n()) # not equivalent
data.frame(temp)
sex name_count num
1 F 189 190
2 M 110 110
subset for babies named Nicholas
nicknames <- babynames %>% filter(name == "Nicholas")
gf_line(prop ~ year, data = nicknames)
nicknames_male <- babynames %>% filter(name == "Nicholas", sex == "M")
gf_line(prop ~ year, data = nicknames_male)
MaryJane <-
babynames %>%
????(name %in% c("Jane", "Mary")) %>% # just the Janes and Marys
group_by(????, ????) %>% # for each year for each name
summarise(count = ????)
MaryJane <-
babynames %>%
filter(name %in% c("Jane", "Mary")) %>% # just the Janes and Marys
group_by(name, year) %>% # for each year for each name
summarise(count = sum(n))
year
on the x-axis and the count of each name on the y-axis.gf_point(count ~ year, data = MaryJane)
gf_line()
.gf_line(???? ~ ????, color = ????, size = ????, data = MaryJane)
gf_line(count ~ year, color = ~ name, size = 2, data = MaryJane)
Filter first on female, then on Mary and Jane. Meanwhile, calculate the proportion of names in the dataset which are Mary and Jane (as opposed to the proportion of all babies from the given year which is the prop variable).
Result2 <-
babynames %>%
filter(sex == "F") %>%
group_by(year) %>%
mutate(total = ????(n)) %>%
filter(????) %>%
mutate(proportion = n / total)
Result2 <-
babynames %>%
filter(sex == "F") %>%
group_by(year) %>%
mutate(total = sum(n)) %>%
filter(name %in% c("Mary", "Jane")) %>%
mutate(proportion = n / total)
gf_line(???? ~ ????, color = ????, data = Result2)
gf_line(proportion ~ year, color = ~ name, data = Result2)
Pat <- babynames %>%
filter(name == "Pat")
gf_line(n ~ year, col = ~ sex, size = 1, data = Pat)