Can we distinguish names that have a clear gender preference?
# Build one row per name with its total count under each sex (columns F and M,
# 0 when the name never occurs for that sex), keep only names that are both
# common and strongly associated with one sex, and label each with that sex.
GenderNames <-
  BabyNames %>%
  group_by(name, sex) %>%
  summarise(total = sum(count)) %>%
  spread(key = sex, value = total, fill = 0) %>%
  # Common names only: combined total above 10,000.
  filter(F + M > 10000) %>%
  # Clear preference: one sex outnumbers the other by more than 10 to 1.
  filter(M > 10 * F | F > 10 * M) %>%
  mutate(sex = ifelse(M >= F, "male", "female"))
Next, derive some simple spelling-based features from each name.
# Spelling-based predictors derived from each name:
#   startV - name begins with a vowel
#   endV   - name ends with a vowel
#   twoV   - name contains two or more consecutive vowels
#   threeV - name contains three vowels, each separated by at most one character
#   nchar  - number of characters in the name
#   hasb   - name contains the letter b (either case)
#
# Fixes relative to the previous version:
#   * endV used the pattern "[aeiou]&", which looks for a literal "&" rather
#     than anchoring at the end of the string; it now uses "$".
#   * The vowel tests are case-insensitive, so a capitalised leading vowel
#     counts (presumably names are capitalised, as the "[Bb]" class in hasb
#     suggests -- confirm against the data).
# NOTE(review): any recorded model output further down the file was produced
# with the old feature definitions and needs to be regenerated.
GenderFeatures <-
  GenderNames %>%
  ungroup() %>%
  mutate(
    name = as.character(name),
    startV = grepl("^[aeiou]", name, ignore.case = TRUE),
    endV = grepl("[aeiou]$", name, ignore.case = TRUE),
    twoV = grepl("[aeiou]{2,}", name, ignore.case = TRUE),
    threeV = grepl("[aeiou].?[aeiou].?[aeiou]", name, ignore.case = TRUE),
    nchar = nchar(name),
    hasb = grepl("[Bb]", name)
  )
How well can we classify the sex based on these features?
# Fit a tree model of sex on the spelling features. The small complexity
# parameter (cp = 0.001) lets the tree keep splits that improve the fit only
# slightly.
mod <- rpart(
  sex ~ startV + endV + twoV + threeV + nchar + hasb,
  data = GenderFeatures,
  cp = 0.001
)

# Draw the fitted tree; `extra = 2` asks prp() for extra per-node information
# (see the rpart.plot documentation for the exact display).
rpart.plot::prp(mod, extra = 2)

# predict() on the fitted tree yields one column per class (`female`, `male`)
# for the training rows; label each row with the larger of the two.
preds <-
  mod %>%
  predict() %>%
  as.data.frame() %>%
  mutate(pred_sex = ifelse(male >= female, "male", "female"))

# Cross-tabulate actual sex against the tree's prediction.
counts(sex ~ preds$pred_sex, data = GenderFeatures)
## preds$pred_sex
## sex female male
## female 54448 2225
## male 28052 3197
# Logistic regression of "is female" on the spelling features, including all
# interactions between them.
logistic_mod <- glm(
  sex == "female" ~ startV * endV * twoV * threeV * nchar * hasb,
  data = GenderFeatures,
  family = "binomial"
)

# Classify as female when the fitted probability exceeds 0.5.
# predict.glm() defaults to type = "link" (log-odds), so the probability scale
# must be requested explicitly; thresholding the link at 0.5 would instead
# correspond to a probability cut-off of about 0.62.
pred_sex <- ifelse(
  predict(logistic_mod, type = "response") > 0.5,
  "female",
  "male"
)

# Cross-tabulate actual sex against the logistic model's prediction.
# NOTE(review): the recorded table that follows was produced with the old
# link-scale threshold; re-run to refresh it.
counts(sex ~ pred_sex, data = GenderFeatures)
## pred_sex
## sex female male
## female 41646 15027
## male 16165 15084
# svm_mod <- svm(sex ~ ., data = GenderFeatures)