WHYY project data cleanup. Problem was inconsistent data in multiple sheets that needed to be combined into single data frame.

library(tidyverse)
library(readxl)
library(purrr)
library(janitor)

Import

This comes from read-xl workflows.

path <- "data-raw/insurance-births-year-hospital-philadelphia-correct-names-edit.xlsx"
sheets <- excel_sheets(path)
births <-
  lapply(excel_sheets(path), read_excel, path = path)
births <- lapply(seq_along(sheets), function(i) {
  data.frame(sheet = I(sheets[i]), births[[i]])
})
births <- do.call(rbind, births)

births <- births %>% clean_names()

births %>% glimpse()
## Rows: 154
## Columns: 8
## $ sheet              <I<chr>> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,…
## $ year               <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 20…
## $ hospital           <chr> "Albert Einstein", "Chestnut Hill", "Children's Ho…
## $ private            <chr> "308", "*", "323", "*", "133", "1685", "3562", "16…
## $ medicaid           <chr> "2440", "*", "66", "*", "381", "2391", "1586", "21…
## $ self_pay_uninsured <chr> "95", "*", "*", "*", "74", "26", "*", "86", "21", …
## $ other              <chr> "*", "*", "79", "*", "54", "*", "*", "39", "219", …
## $ unknown            <chr> "*", "*", "*", "*", "154", "155", "*", "27", "288"…

Cleaning

births <- births %>% 
  mutate(
    hospital = hospital %>% str_to_upper(),
    private = private %>% as.numeric(),
    medicaid = medicaid %>% as.numeric(),
    self_pay_uninsured = self_pay_uninsured %>% as.numeric(),
    other = other %>% as.numeric(),
    unknown = unknown %>% as.numeric(),
  )
## Warning in private %>% as.numeric(): NAs introduced by coercion
## Warning in medicaid %>% as.numeric(): NAs introduced by coercion
## Warning in self_pay_uninsured %>% as.numeric(): NAs introduced by coercion
## Warning in other %>% as.numeric(): NAs introduced by coercion
## Warning in unknown %>% as.numeric(): NAs introduced by coercion
births %>% glimpse()
## Rows: 154
## Columns: 8
## $ sheet              <I<chr>> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,…
## $ year               <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 20…
## $ hospital           <chr> "ALBERT EINSTEIN", "CHESTNUT HILL", "CHILDREN'S HO…
## $ private            <dbl> 308, NA, 323, NA, 133, 1685, 3562, 169, 988, 356, …
## $ medicaid           <dbl> 2440, NA, 66, NA, 381, 2391, 1586, 2114, 936, 2353…
## $ self_pay_uninsured <dbl> 95, NA, NA, NA, 74, 26, NA, 86, 21, 59, NA, NA, 18…
## $ other              <dbl> NA, NA, 79, NA, 54, NA, NA, 39, 219, 10, 81, NA, 1…
## $ unknown            <dbl> NA, NA, NA, NA, 154, 155, NA, 27, 288, NA, NA, NA,…

Export

births %>% write_excel_csv("data-processed/phi-ins-births.csv")