Jo Hardin
June 20, 2016
In conrast to (most) static data, dynamic data…
To bring dynamic data sets into the curriculum
… in order to answer interesting statistical questions
… and other scientific questions.
college_url <- "https://s3.amazonaws.com/ed-college-choice-public/Most+Recent+Cohorts+(All+Data+Elements).csv"
college_data <- read_csv(college_url)
dim(college_data)
## [1] 7804 1728
names(college_data)[33:40]
## [1] "WOMENONLY" "RELAFFIL" "ADM_RATE" "ADM_RATE_ALL"
## [5] "SATVR25" "SATVR75" "SATMT25" "SATMT75"
Very big data set, lots of cleaning to do.
college_debt = college_data %>%
select(INSTNM,STABBR,PREDDEG, HIGHDEG, region, LOCALE, CCUGPROF,HBCU,
WOMENONLY, RELAFFIL,ADM_RATE, SATVRMID, SATMTMID,SATWRMID,
SAT_AVG, UG,NPT4_PUB, NPT4_PRIV, COSTT4_A, DEBT_MDN, CUML_DEBT_P90,
mn_earn_wne_p10,md_earn_wne_p10) %>%
mutate(region2 = ifelse(region=="0", "Military",
ifelse(region=="1", "New England",
ifelse(region=="2", "Mid East",
ifelse(region=="3", "Great Lakes",
ifelse(region=="4", "Plains",
ifelse(region=="5", "Southeast",
ifelse(region=="6", "Southwest",
ifelse(region=="7", "Rocky Mnts",
ifelse(region=="8", "Far West", "Outlying")))))))))) %>%
mutate(ADM_RATE = extract_numeric(ADM_RATE),
SATVRMID = extract_numeric(SATVRMID),
SATMTMID = extract_numeric(SATMTMID),
SATWRMID = extract_numeric(SATWRMID),
SAT_AVG = extract_numeric(SAT_AVG),
UG = extract_numeric(UG),
NPT4_PUB = extract_numeric(NPT4_PUB),
NPT4_PRIV = extract_numeric(NPT4_PRIV),
COSTT4_A = extract_numeric(COSTT4_A),
DEBT_MDN = extract_numeric(DEBT_MDN),
CUML_DEBT_P90 = extract_numeric(CUML_DEBT_P90),
mn_earn_wne_p10 = extract_numeric(mn_earn_wne_p10),
md_earn_wne_p10 = extract_numeric(md_earn_wne_p10)) %>%
mutate(RELAFFIL = ifelse(RELAFFIL=="NULL", NA, RELAFFIL),
LOCALE = ifelse(LOCALE =="NULL", NA, LOCALE),
CCUGPROF = ifelse(CCUGPROF=="NULL", NA, CCUGPROF),
HBCU = ifelse(HBCU=="NULL", NA, HBCU),
WOMENONLY = ifelse(WOMENONLY=="NULL", NA, WOMENONLY))
str(college_debt)
## Classes 'tbl_df', 'tbl' and 'data.frame': 7804 obs. of 24 variables:
## $ INSTNM : chr "Alabama A & M University" "University of Alabama at Birmingham" "Amridge University" "University of Alabama in Huntsville" ...
## $ STABBR : chr "AL" "AL" "AL" "AL" ...
## $ PREDDEG : int 3 3 3 3 3 3 2 3 3 3 ...
## $ HIGHDEG : int 4 4 4 4 4 4 2 3 4 4 ...
## $ region : int 5 5 5 5 5 5 5 5 5 5 ...
## $ LOCALE : chr "12" "12" "12" "12" ...
## $ CCUGPROF : chr "9" "8" "6" "8" ...
## $ HBCU : chr "1" "0" "0" "0" ...
## $ WOMENONLY : chr "0" "0" "0" "0" ...
## $ RELAFFIL : chr NA NA "74" NA ...
## $ ADM_RATE : num 0.899 0.867 NA 0.806 0.512 ...
## $ SATVRMID : num 410 580 NA 575 430 555 NA NA NA 570 ...
## $ SATMTMID : num 400 585 NA 580 425 570 NA NA NA 595 ...
## $ SATWRMID : num NA NA NA NA NA 540 NA NA NA 565 ...
## $ SAT_AVG : num 823 1146 NA 1180 830 ...
## $ UG : num 4380 10331 98 5220 4348 ...
## $ NPT4_PUB : num 13415 14805 NA 17520 11936 ...
## $ NPT4_PRIV : num NA NA 7455 NA NA ...
## $ COSTT4_A : num 18888 19990 12300 20306 17400 ...
## $ DEBT_MDN : num 19500 16250 10500 16500 15854 ...
## $ CUML_DEBT_P90 : num 50114 40000 40000 40750 45846 ...
## $ mn_earn_wne_p10: num 35300 46300 42100 52700 30700 49100 31400 41500 36700 52100 ...
## $ md_earn_wne_p10: num 31400 40300 38100 46600 27800 42400 27100 39700 34800 45400 ...
## $ region2 : chr "Southeast" "Southeast" "Southeast" "Southeast" ...
summary(college_debt)
## INSTNM STABBR PREDDEG HIGHDEG
## Length:7804 Length:7804 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:1.000 1st Qu.:1.000
## Mode :character Mode :character Median :2.000 Median :2.000
## Mean :1.789 Mean :2.176
## 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :4.000 Max. :4.000
##
## region LOCALE CCUGPROF HBCU
## Min. :0.000 Length:7804 Length:7804 Length:7804
## 1st Qu.:3.000 Class :character Class :character Class :character
## Median :5.000 Mode :character Mode :character Mode :character
## Mean :4.621
## 3rd Qu.:6.000
## Max. :9.000
##
## WOMENONLY RELAFFIL ADM_RATE SATVRMID
## Length:7804 Length:7804 Min. :0.000 Min. :290.0
## Class :character Class :character 1st Qu.:0.552 1st Qu.:475.0
## Mode :character Mode :character Median :0.700 Median :515.0
## Mean :0.682 Mean :521.8
## 3rd Qu.:0.834 3rd Qu.:555.0
## Max. :1.000 Max. :760.0
## NA's :5584 NA's :6503
## SATMTMID SATWRMID SAT_AVG UG
## Min. :310.0 Min. :350.0 Min. : 666.0 Min. : 0
## 1st Qu.:483.0 1st Qu.:470.0 1st Qu.: 971.8 1st Qu.: 137
## Median :520.0 Median :510.0 Median :1036.5 Median : 754
## Mean :530.8 Mean :521.2 Mean :1056.7 Mean : 2648
## 3rd Qu.:565.0 3rd Qu.:559.0 3rd Qu.:1117.2 3rd Qu.: 2785
## Max. :785.0 Max. :755.0 Max. :1534.0 Max. :46834
## NA's :6489 NA's :7011 NA's :6384 NA's :2848
## NPT4_PUB NPT4_PRIV COSTT4_A DEBT_MDN
## Min. :-1643 Min. :-1220 Min. : 4157 Min. : 333
## 1st Qu.: 6320 1st Qu.:13132 1st Qu.:14143 1st Qu.: 7710
## Median : 8792 Median :18259 Median :22865 Median : 9833
## Mean : 9584 Mean :18072 Mean :24354 Mean : 11830
## 3rd Qu.:12480 3rd Qu.:22485 3rd Qu.:30383 3rd Qu.: 15462
## Max. :27199 Max. :87570 Max. :74473 Max. :131335
## NA's :5881 NA's :3051 NA's :3667 NA's :1163
## CUML_DEBT_P90 mn_earn_wne_p10 md_earn_wne_p10 region2
## Min. : 333 Min. : 12300 Min. : 8400 Length:7804
## 1st Qu.: 14750 1st Qu.: 27300 1st Qu.: 24200 Class :character
## Median : 24317 Median : 34500 Median : 31200 Mode :character
## Mean : 25147 Mean : 37184 Mean : 33233
## 3rd Qu.: 33798 3rd Qu.: 43300 3rd Qu.: 39200
## Max. :131335 Max. :250000 Max. :250000
## NA's :1586 NA's :2168 NA's :2168
The NHANES data (from the National Health and Nutrition Examination Survey - nationwide survey of CDC)
require(Hmisc)
NHANES.demo <- sasxport.get("http://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DEMO_G.XPT")
## Processing SAS dataset DEMO_G ..
NHANES.body <- sasxport.get("http://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/BMX_G.XPT")
## Processing SAS dataset BMX_G ..
NHANES.demo <-
mutate(NHANES.demo, gender = ifelse(NHANES.demo$riagendr==1, "male", "female"))
NHANES.comb <-
inner_join(NHANES.body, NHANES.demo, by = "seqn")
head(NHANES.comb)
## seqn bmdstats bmxwt bmiwt bmxrecum bmirecum bmxhead bmihead bmxht bmiht
## 1 62161 1 69.2 NA NA NA NA NA 172.3 NA
## 2 62162 1 12.7 NA 95.7 NA NA NA 94.7 NA
## 3 62163 1 49.4 NA NA NA NA NA 168.9 NA
## 4 62164 1 67.2 NA NA NA NA NA 170.1 NA
## 5 62165 1 69.1 NA NA NA NA NA 159.4 NA
## 6 62166 1 28.8 NA NA NA NA NA 133.4 NA
## bmxbmi bmdbmic bmxleg bmileg bmxarml bmiarml bmxarmc bmiarmc bmxwaist
## 1 23.3 NA 40.2 NA 35.0 NA 32.5 NA 81.0
## 2 14.2 2 NA NA 18.5 NA 16.6 NA 45.4
## 3 17.3 2 40.3 NA 36.3 NA 22.0 NA 64.6
## 4 23.2 NA 40.5 NA 37.2 NA 29.3 NA 80.1
## 5 27.2 3 42.1 NA 35.2 NA 29.7 NA 86.7
## 6 16.2 2 31.0 NA 28.0 NA 19.1 NA 59.8
## bmiwaist bmxsad1 bmxsad2 bmxsad3 bmxsad4 bmdavsad bmdsadcm sddsrvyr
## 1 NA 17.7 17.9 NA NA 17.8 NA 7
## 2 NA NA NA NA NA NA NA 7
## 3 NA 15.6 15.5 NA NA 15.6 NA 7
## 4 NA 18.3 18.5 NA NA 18.4 NA 7
## 5 NA 21.0 20.8 NA NA 20.9 NA 7
## 6 NA 13.5 13.5 NA NA 13.5 NA 7
## ridstatr riagendr ridageyr ridagemn ridreth1 ridreth3 ridexmon ridexagy
## 1 2 1 22 NA 3 3 2 NA
## 2 2 2 3 NA 1 1 1 3
## 3 2 1 14 NA 5 6 2 14
## 4 2 2 44 NA 3 3 1 NA
## 5 2 2 14 NA 4 4 2 14
## 6 2 1 9 NA 3 3 2 10
## ridexagm dmqmiliz dmqadfc dmdborn4 dmdcitzn dmdyrsus dmdeduc3 dmdeduc2
## 1 NA 2 NA 1 1 NA NA 3
## 2 41 NA NA 1 1 NA NA NA
## 3 177 NA NA 1 1 NA 8 NA
## 4 NA 1 2 1 1 NA NA 4
## 5 179 NA NA 1 1 NA 7 NA
## 6 120 NA NA 1 1 NA 3 NA
## dmdmartl ridexprg sialang siaproxy siaintrp fialang fiaproxy fiaintrp
## 1 5 NA 1 1 2 1 2 2
## 2 NA NA 1 1 2 1 2 2
## 3 NA NA 1 1 2 1 2 2
## 4 1 2 1 2 2 1 2 2
## 5 NA NA 1 1 2 1 2 2
## 6 NA NA 1 1 2 1 2 2
## mialang miaproxy miaintrp aialanga wtint2yr wtmec2yr sdmvpsu
## 1 1 2 2 1 102641.406 104236.583 1
## 2 NA NA NA NA 15457.737 16116.354 3
## 3 1 2 2 1 7397.685 7869.485 3
## 4 NA NA NA NA 127351.373 127965.226 1
## 5 1 2 2 1 12209.745 13384.042 2
## 6 1 2 2 NA 60593.637 64068.123 1
## sdmvstra indhhin2 indfmin2 indfmpir dmdhhsiz dmdfmsiz dmdhhsza dmdhhszb
## 1 91 14 14 3.15 5 5 0 1
## 2 92 4 4 0.60 6 6 2 2
## 3 90 15 15 4.07 5 5 0 2
## 4 94 8 8 1.67 5 5 1 2
## 5 90 4 4 0.57 5 5 1 2
## 6 91 77 77 NA 6 6 0 4
## dmdhhsze dmdhrgnd dmdhrage dmdhrbr4 dmdhredu dmdhrmar dmdhsedu gender
## 1 0 2 50 1 5 1 5 male
## 2 0 2 24 1 3 6 NA female
## 3 1 1 42 1 5 1 4 male
## 4 0 1 52 1 4 1 4 female
## 5 0 2 33 2 2 77 NA female
## 6 0 1 44 1 5 1 5 male
Because the NHANES data were collected using a cluster sampling scheme, it is important to use the variables which describe the weights on the sampling to create a sample which is reflective of the population.
numobs = 5000
SRSsample <- sample(1:nrow(NHANES.comb), numobs, replace=FALSE,
prob=NHANES.comb$wtmec2yr/sum(NHANES.comb$wtmec2yr))
NHANES.comb <- NHANES.comb[SRSsample,]
Adults = NHANES.comb %>%
filter(ridageyr >=18, bmxbmi>1) %>%
filter(dmdmartl>0 & dmdmartl < 10) %>%
mutate(rel=ifelse(dmdmartl==6|dmdmartl==1, "committed", "not")) %>%
mutate(bmi=bmxbmi)
boxplot(bmi ~ rel, data=Adults, xlab="Relationship Status", ylab="BMI")
ggplot(Adults, aes(rel, bmi))+ geom_violin(color="orange")+
xlab("Relationship Status") + ylab("BMI")
dim(Adults)
## [1] 3221 76
t.test(bmi ~ rel, data=Adults)
##
## Welch Two Sample t-test
##
## data: bmi by rel
## t = 0.48029, df = 2835.6, p-value = 0.6311
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3574424 0.5893576
## sample estimates:
## mean in group committed mean in group not
## 28.82170 28.70574
In a linear models or computational statistics class, smooth curves might be a topic of discussion:
ggplot(Adults, aes(x=bmxht, y=bmxwt, group=gender, color=gender)) + geom_point(alpha=.5)+
xlab("Height") + ylab("Weight") + ggtitle("Height vs Weight by Gender")
ggplot(Adults, aes(x=bmxht, y=bmxwt, group=gender, color=gender)) +
xlab("Height") + ylab("Weight") + geom_point(alpha=.5)+
stat_smooth(alpha=1)+
ggtitle("Height vs Weight by Gender with Smooth Regression Fit")
Jo Hardin
Pomona College
jo.hardin@pomona.edu