library(tidyverse)
library(janitor)
Solutions Day 1
This is the notebook fellows will work through on the first day of the Center for Health Journalism Hands-On R course.
You will be using daily weather summaries that have been downloaded from Climate Data Online. The explanations use Texas, but there are files for Arkansas, California, New York and North Carolina for practice.
Goals
Our goals are to:
- Import our data.
- Check all the column data types.
- Add some new columns based on the date.
- Recode some values in our data.
- Remove some unnecessary variables/columns.
- Export our cleaned data.
Setup
Add the entire code block for libraries.
Import
Follow the directions in the lesson to import the Texas data, starting with adding a new code block:
# Read the Texas weather CSV and pass it through janitor's clean_names(),
# then store the result as tx_raw.
tx_raw <- read_csv("data-raw/tx.csv") |>
  clean_names()
Rows: 94503 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): STATION, NAME
dbl (7): PRCP, SNOW, SNWD, TAVG, TMAX, TMIN, TOBS
date (1): DATE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the imported data to inspect it.
tx_raw
OYO: Import a different state
Go through all the steps above, but with a different state.
Peeking at data
Use head, tail, glimpse and summary to look at the Texas data.
Look at the top of your data:
# Show the first rows of the data.
tx_raw |> head()
Look at 8 lines of the bottom of your data:
# Show the last 8 rows of the data.
tx_raw |> tail(8)
Use glimpse to see all your columns:
# List every column with its type and first few values.
tx_raw |> glimpse()
Rows: 94,503
Columns: 10
$ station <chr> "USW00012918", "USW00012918", "USW00012918", "USW00012918", "U…
$ name <chr> "HOUSTON WILLIAM P HOBBY AIRPORT, TX US", "HOUSTON WILLIAM P H…
$ date <date> 1930-08-01, 1930-08-02, 1930-08-03, 1930-08-04, 1930-08-05, 1…
$ prcp <dbl> 3.00, 0.09, NA, 0.02, 0.12, NA, NA, NA, 0.00, 0.00, 0.00, NA, …
$ snow <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ snwd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ tavg <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ tmax <dbl> 99, 97, 95, 95, 92, 92, 96, 97, 94, 92, 99, 99, 98, 98, 98, 97…
$ tmin <dbl> 75, 79, 78, 79, 76, 74, 71, 71, 75, 72, 70, 71, 78, 72, 73, 70…
$ tobs <dbl> 86, 89, 89, 85, 83, 89, 83, 82, 85, 85, 87, 86, 86, 85, 86, 84…
Use summary to learn about all your variables:
# Summary statistics (range, quartiles, NA counts) for every column.
tx_raw |> summary()
station name date prcp
Length:94503 Length:94503 Min. :1930-08-01 Min. : 0.0000
Class :character Class :character 1st Qu.:1959-01-12 1st Qu.: 0.0000
Mode :character Mode :character Median :1980-08-05 Median : 0.0000
Mean :1980-06-04 Mean : 0.1121
3rd Qu.:2002-03-09 3rd Qu.: 0.0000
Max. :2023-09-30 Max. :12.0700
NA's :1867
snow snwd tavg tmax
Min. :0.000 Min. :0.000 Min. : 0.0 Min. : 13.00
1st Qu.:0.000 1st Qu.:0.000 1st Qu.:60.0 1st Qu.: 69.00
Median :0.000 Median :0.000 Median :73.0 Median : 81.00
Mean :0.003 Mean :0.004 Mean :70.2 Mean : 78.56
3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:82.0 3rd Qu.: 91.00
Max. :7.800 Max. :7.000 Max. :98.0 Max. :112.00
NA's :15369 NA's :15463 NA's :78843 NA's :16
tmin tobs
Min. :-2.00 Min. :24.00
1st Qu.:47.00 1st Qu.:61.00
Median :61.00 Median :72.00
Mean :58.65 Mean :69.65
3rd Qu.:72.00 3rd Qu.:80.00
Max. :93.00 Max. :99.00
NA's :16 NA's :91914
# Summarize just the date column to see the span of dates covered.
tx_raw$date |> summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
"1930-08-01" "1959-01-12" "1980-08-05" "1980-06-04" "2002-03-09" "2023-09-30"
OYO: Peek at your state’s data
Create or change data
Create year, month and day-of-year values based on the date.
# Derive date-part columns from `date`:
#   yr - the year
#   mn - the month as an abbreviated, ordered label (label = TRUE)
#   yd - the day of the year
tx_dates <- tx_raw |>
  mutate(
    yr = year(date),
    mn = month(date, label = TRUE),
    yd = yday(date)
  )

# Confirm the three new columns were added.
tx_dates |> glimpse()
Rows: 94,503
Columns: 13
$ station <chr> "USW00012918", "USW00012918", "USW00012918", "USW00012918", "U…
$ name <chr> "HOUSTON WILLIAM P HOBBY AIRPORT, TX US", "HOUSTON WILLIAM P H…
$ date <date> 1930-08-01, 1930-08-02, 1930-08-03, 1930-08-04, 1930-08-05, 1…
$ prcp <dbl> 3.00, 0.09, NA, 0.02, 0.12, NA, NA, NA, 0.00, 0.00, 0.00, NA, …
$ snow <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ snwd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ tavg <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ tmax <dbl> 99, 97, 95, 95, 92, 92, 96, 97, 94, 92, 99, 99, 98, 98, 98, 97…
$ tmin <dbl> 75, 79, 78, 79, 76, 74, 71, 71, 75, 72, 70, 71, 78, 72, 73, 70…
$ tobs <dbl> 86, 89, 89, 85, 83, 89, 83, 82, 85, 85, 87, 86, 86, 85, 86, 84…
$ yr <dbl> 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 19…
$ mn <ord> Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Au…
$ yd <dbl> 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 22…
OYO: Make date parts
Make the same date parts, but with your own state data:
Recoding values
Use distinct so you can see the station names:
# List each unique station name so we know which values to recode.
tx_dates |> distinct(name)
Use mutate to recode
Use recode to create a new column of short city names:
# Add a `city` column holding a short city name for each station name.
# recode() maps each old value (left of =) to its new value (right of =).
tx_names <- tx_dates |>
  mutate(
    city = recode(
      name,
      "HOUSTON WILLIAM P HOBBY AIRPORT, TX US" = "Houston",
      "AUSTIN CAMP MABRY, TX US" = "Austin",
      "DALLAS FAA AIRPORT, TX US" = "Dallas"
    )
  )

# Confirm the new city column exists.
tx_names |> glimpse()
Rows: 94,503
Columns: 14
$ station <chr> "USW00012918", "USW00012918", "USW00012918", "USW00012918", "U…
$ name <chr> "HOUSTON WILLIAM P HOBBY AIRPORT, TX US", "HOUSTON WILLIAM P H…
$ date <date> 1930-08-01, 1930-08-02, 1930-08-03, 1930-08-04, 1930-08-05, 1…
$ prcp <dbl> 3.00, 0.09, NA, 0.02, 0.12, NA, NA, NA, 0.00, 0.00, 0.00, NA, …
$ snow <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ snwd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ tavg <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ tmax <dbl> 99, 97, 95, 95, 92, 92, 96, 97, 94, 92, 99, 99, 98, 98, 98, 97…
$ tmin <dbl> 75, 79, 78, 79, 76, 74, 71, 71, 75, 72, 70, 71, 78, 72, 73, 70…
$ tobs <dbl> 86, 89, 89, 85, 83, 89, 83, 82, 85, 85, 87, 86, 86, 85, 86, 84…
$ yr <dbl> 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 19…
$ mn <ord> Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Au…
$ yd <dbl> 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 22…
$ city <chr> "Houston", "Houston", "Houston", "Houston", "Houston", "Housto…
Now check your results using distinct on name and city.
# Show each name/city pairing to verify the recode worked as intended.
tx_names |> distinct(name, city)
OYO: Recode your cities
Make similar short names, but for your state.
Select
Create a new version of your data with only the columns you need, in the order you want them.
# Keep only the columns we need, in the order we want them.
# `rain = prcp` renames prcp to rain as part of the selection.
tx_tight <- tx_names |>
  select(
    city,
    date,
    rain = prcp,
    snow,
    snwd,
    tmax,
    tmin,
    yr,
    mn,
    yd
  )

# Confirm the final set of columns.
tx_tight |> glimpse()
Rows: 94,503
Columns: 10
$ city <chr> "Houston", "Houston", "Houston", "Houston", "Houston", "Houston",…
$ date <date> 1930-08-01, 1930-08-02, 1930-08-03, 1930-08-04, 1930-08-05, 1930…
$ rain <dbl> 3.00, 0.09, NA, 0.02, 0.12, NA, NA, NA, 0.00, 0.00, 0.00, NA, NA,…
$ snow <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ snwd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ tmax <dbl> 99, 97, 95, 95, 92, 92, 96, 97, 94, 92, 99, 99, 98, 98, 98, 97, 9…
$ tmin <dbl> 75, 79, 78, 79, 76, 74, 71, 71, 75, 72, 70, 71, 78, 72, 73, 70, 7…
$ yr <dbl> 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930,…
$ mn <ord> Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, Aug, …
$ yd <dbl> 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, …
OYO: Select your cols
Go through the same process as above, but with your own state data.
Export
Write the file out as “rds” to the data-processed folder.
# Save the cleaned data as an .rds file in the data-processed folder.
tx_tight |> write_rds("data-processed/tx_clean.rds")
OYO: Export your state
Write your data to the data-processed folder. Make sure you use a name for your state.
Checking your notebooks
Clear out your notebook and rerun all the code. Render the HTML page.