Classifying BabyNames

Can we distinguish names that have a clear gender preference?

# Total the counts per name and sex, put the two sexes side by side as
# columns F and M (0 where a name/sex pair is absent), then keep names whose
# combined total exceeds 10000 and where one sex outnumbers the other by more
# than ten to one. Each surviving name is labelled with its majority sex.
GenderNames <-
  BabyNames %>%
  group_by(name, sex) %>%
  summarise(total = sum(count)) %>%
  spread(key = sex, value = total, fill = 0) %>%
  filter(
    F + M > 10000,
    F > 10 * M | M > 10 * F
  ) %>%
  mutate(sex = ifelse(F > M, "female", "male"))

Some simple features computed from the spelling of each name

# Derive simple spelling-based predictors for every name in GenderNames.
#
# The vowel tests are case-insensitive so that a capitalised initial vowel is
# counted (hasb already allows for both "B" and "b"; names are presumably
# capitalised -- confirm against BabyNames).
#
# NOTE(review): any model output recorded further down this file was produced
# with the earlier feature definitions and should be regenerated.
GenderFeatures <-
  GenderNames %>%
  ungroup %>%
  mutate(name = as.character(name),
         # name begins with a vowel
         startV = grepl("^[aeiou]", name, ignore.case = TRUE),
         # name ends with a vowel; "$" anchors the match at the end of the
         # string (the previous pattern ended in "&", a literal ampersand)
         endV = grepl("[aeiou]$", name, ignore.case = TRUE),
         # two or more vowels in a row
         twoV = grepl("[aeiou]{2,}", name, ignore.case = TRUE),
         # three vowels, each separated by at most one other character
         threeV = grepl("[aeiou].?[aeiou].?[aeiou]", name,
                        ignore.case = TRUE),
         # length of the name in characters
         nchar = nchar(name),
         # name contains the letter b in either case
         hasb = grepl("[Bb]", name)
         )

How well can we classify the sex based on these features?

# Fit a tree predicting sex from the spelling features, using a complexity
# parameter of 0.001, then draw the fitted tree.
mod <- rpart(
  sex ~ startV + endV + twoV + threeV + nchar + hasb,
  data = GenderFeatures,
  control = rpart.control(cp = 0.001)
)
rpart.plot::prp(mod, extra = 2)

# Turn the tree's fitted values into a data frame, label each row with
# whichever of the female/male columns is larger, and cross-tabulate the
# labels against the recorded sex.
preds <-
  mod %>%
  predict() %>%
  as.data.frame() %>%
  mutate(pred_sex = ifelse(female > male, "female", "male"))
counts(sex ~ preds$pred_sex, data = GenderFeatures)
##         preds$pred_sex
## sex      female  male
##   female  54448  2225
##   male    28052  3197
# Logistic regression for P(sex == "female") using the spelling features and
# all of their interactions.
logistic_mod <- glm(
  sex == "female" ~ startV * endV * twoV * threeV * nchar * hasb,
  data = GenderFeatures,
  family = "binomial"
)
# predict() on a glm returns the linear predictor (log-odds) by default, so a
# 0.5 cut-off is only a probability threshold when type = "response" is
# requested explicitly.
pred_sex <- ifelse(
  predict(logistic_mod, type = "response") > 0.5,
  "female",
  "male"
)
counts(sex ~ pred_sex, data = GenderFeatures)
# NOTE(review): the table recorded below was produced with the earlier
# link-scale threshold; re-run to refresh it.
##         pred_sex
## sex      female  male
##   female  41646 15027
##   male    16165 15084
# svm_mod <- svm(sex ~ ., data = GenderFeatures)