WHYY project data cleanup. Problem was inconsistent data in multiple sheets that needed to be combined into single data frame.
library(tidyverse)
library(readxl)
library(purrr)
These fixes were done in Excel before processing:
The exported data still needs cleaning on hospital names and possibly county names.
This comes from read-xl workflows.
path <- "data-raw/births-by-hospital-cleaned.xlsx"
sheets <- excel_sheets(path)
births <-
lapply(excel_sheets(path), read_excel, path = path)
births <- lapply(seq_along(sheets), function(i) {
data.frame(sheet = I(sheets[i]), births[[i]])
})
births <- do.call(rbind, births)
births %>% glimpse()
## Rows: 2,472
## Columns: 7
## $ sheet <I<chr>> 2018-raw, 2018-raw, 2018-raw, 2018-raw, 2018-raw, …
## $ county <chr> "State Total", "Adams", "Allegheny", "Allegheny", "Al…
## $ hospital <chr> NA, "WellSpan Gettysburg Hospital", "Allegheny Genera…
## $ total <dbl> 135026, 464, 1, 1367, 722, 1028, 2, 1523, 1, 8537, 10…
## $ cesarean_total <dbl> 40758, 124, 0, 444, 295, 317, 0, 452, 1, 2484, 269, 0…
## $ cesarean_repeat <dbl> 16295, 50, 0, 170, 123, 136, 0, 174, 0, 244, 98, 0, 0…
## $ vbac <dbl> 3416, 2, 0, 28, 5, 33, 1, 13, 0, 87, 27, 0, 0, 135, 5…
Made county and hospital values uppercase. Pulled year from sheet name.
births <- births %>%
mutate(
county = county %>% str_to_upper(),
hospital = hospital %>% str_to_upper(),
year = sheet %>% str_sub(1,4)
) %>%
select(-sheet) %>%
select(year, everything())
births %>% glimpse()
## Rows: 2,472
## Columns: 7
## $ year <chr> "2018", "2018", "2018", "2018", "2018", "2018", "2018…
## $ county <chr> "STATE TOTAL", "ADAMS", "ALLEGHENY", "ALLEGHENY", "AL…
## $ hospital <chr> NA, "WELLSPAN GETTYSBURG HOSPITAL", "ALLEGHENY GENERA…
## $ total <dbl> 135026, 464, 1, 1367, 722, 1028, 2, 1523, 1, 8537, 10…
## $ cesarean_total <dbl> 40758, 124, 0, 444, 295, 317, 0, 452, 1, 2484, 269, 0…
## $ cesarean_repeat <dbl> 16295, 50, 0, 170, 123, 136, 0, 174, 0, 244, 98, 0, 0…
## $ vbac <dbl> 3416, 2, 0, 28, 5, 33, 1, 13, 0, 87, 27, 0, 0, 135, 5…
births %>% write_excel_csv("data-processed/penn-births.csv")
births %>%
filter(
county == "PHILADELPHIA"
) %>%
write_excel_csv("data-processed/phi-births.csv")