01-Cleaning

Setup

Importing the libraries I’ll need (or might need).

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

Import the Storm Data

Read the storm event CSVs into a single data frame.

df <- read_csv(c(
  "data-raw/StormEvents_details-ftp_v1.0_d2000_c20220425.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2001_c20220425.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2002_c20220425.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2003_c20220425.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2004_c20220425.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2005_c20220425.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2006_c20220425.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2007_c20240216.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2008_c20220425.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2009_c20231116.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2010_c20220425.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2011_c20230417.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2012_c20221216.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2013_c20230118.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2014_c20231116.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2015_c20220425.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2016_c20220719.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2017_c20230317.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2018_c20230616.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2019_c20240117.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2020_c20231217.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2021_c20231217.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2022_c20240216.csv",
  "data-raw/StormEvents_details-ftp_v1.0_d2023_c20240216.csv"
  ))
Rows: 1447202 Columns: 51
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (26): STATE, MONTH_NAME, EVENT_TYPE, CZ_TYPE, CZ_NAME, WFO, BEGIN_DATE_T...
dbl (25): BEGIN_YEARMONTH, BEGIN_DAY, BEGIN_TIME, END_YEARMONTH, END_DAY, EN...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
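
For reference, the vector of paths could also be built programmatically. This is just a sketch, assuming every detail file sits in data-raw/ and follows the StormEvents_details-ftp_* naming pattern used above:

# Build the file list with list.files() instead of typing each path.
# The regex is an assumption based on the file names above.
storm_files <- list.files(
  "data-raw",
  pattern = "^StormEvents_details-ftp_v1\\.0_d20\\d{2}_c\\d{8}\\.csv$",
  full.names = TRUE
)

# df <- read_csv(storm_files)
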
df |> glimpse()
Rows: 1,447,202
Columns: 51
$ BEGIN_YEARMONTH    <dbl> 200012, 200012, 200012, 200012, 200008, 200008, 200…
$ BEGIN_DAY          <dbl> 31, 31, 31, 13, 3, 9, 9, 21, 21, 16, 16, 17, 17, 1,…
$ BEGIN_TIME         <dbl> 600, 600, 700, 2200, 1410, 1405, 1615, 600, 600, 90…
$ END_YEARMONTH      <dbl> 200012, 200012, 200012, 200012, 200008, 200008, 200…
$ END_DAY            <dbl> 31, 31, 31, 14, 3, 9, 9, 22, 22, 17, 17, 18, 18, 29…
$ END_TIME           <dbl> 900, 900, 800, 400, 1410, 1405, 1615, 400, 400, 700…
$ EPISODE_ID         <dbl> 1104812, 1104812, 1104812, 1105342, 1101140, 110114…
$ EVENT_ID           <dbl> 5165377, 5165378, 5165379, 5165449, 5172568, 517256…
$ STATE              <chr> "FLORIDA", "FLORIDA", "FLORIDA", "WEST VIRGINIA", "…
$ STATE_FIPS         <dbl> 12, 12, 12, 54, 28, 28, 28, 23, 23, 23, 23, 9, 9, 1…
$ YEAR               <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 200…
$ MONTH_NAME         <chr> "December", "December", "December", "December", "Au…
$ EVENT_TYPE         <chr> "Extreme Cold/Wind Chill", "Extreme Cold/Wind Chill…
$ CZ_TYPE            <chr> "Z", "Z", "Z", "Z", "C", "C", "C", "Z", "Z", "Z", "…
$ CZ_FIPS            <dbl> 67, 70, 73, 23, 123, 69, 23, 30, 29, 15, 10, 7, 11,…
$ CZ_NAME            <chr> "INLAND PALM BEACH", "INLAND COLLIER", "INLAND DADE…
$ WFO                <chr> "MFL", "MFL", "MFL", "PBZ", "JAN", "JAN", "JAN", "C…
$ BEGIN_DATE_TIME    <chr> "31-DEC-00 06:00:00", "31-DEC-00 06:00:00", "31-DEC…
$ CZ_TIMEZONE        <chr> "EST", "EST", "EST", "EST", "CST", "CST", "CST", "E…
$ END_DATE_TIME      <chr> "31-DEC-00 09:00:00", "31-DEC-00 09:00:00", "31-DEC…
$ INJURIES_DIRECT    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ INJURIES_INDIRECT  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ DEATHS_DIRECT      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ DEATHS_INDIRECT    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ DAMAGE_PROPERTY    <chr> NA, NA, NA, NA, "2K", "2K", "1K", NA, NA, NA, NA, N…
$ DAMAGE_CROPS       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ SOURCE             <chr> "NEWSPAPER", "NEWSPAPER", "NEWSPAPER", "LAW ENFORCE…
$ MAGNITUDE          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ MAGNITUDE_TYPE     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ FLOOD_CAUSE        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ CATEGORY           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_F_SCALE        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_LENGTH         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_WIDTH          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_OTHER_WFO      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_OTHER_CZ_STATE <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_OTHER_CZ_FIPS  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_OTHER_CZ_NAME  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ BEGIN_RANGE        <dbl> NA, NA, NA, NA, NA, 5, 2, NA, NA, NA, NA, NA, NA, N…
$ BEGIN_AZIMUTH      <chr> NA, NA, NA, NA, NA, "S", "NW", NA, NA, NA, NA, NA, …
$ BEGIN_LOCATION     <chr> NA, NA, NA, NA, "FORKVILLE", "DE KALB", "SHUBUTA", …
$ END_RANGE          <dbl> NA, NA, NA, NA, NA, 5, 2, NA, NA, NA, NA, NA, NA, N…
$ END_AZIMUTH        <chr> NA, NA, NA, NA, NA, "S", "NW", NA, NA, NA, NA, NA, …
$ END_LOCATION       <chr> NA, NA, NA, NA, "FORKVILLE", "DE KALB", "SHUBUTA", …
$ BEGIN_LAT          <dbl> NA, NA, NA, NA, 32.45000, 32.70000, 31.86667, NA, N…
$ BEGIN_LON          <dbl> NA, NA, NA, NA, -89.65000, -88.65000, -88.71667, NA…
$ END_LAT            <dbl> NA, NA, NA, NA, 32.45000, 32.70000, 31.86667, NA, N…
$ END_LON            <dbl> NA, NA, NA, NA, -89.65000, -88.65000, -88.71667, NA…
$ EPISODE_NARRATIVE  <chr> "Temperatures fell into the mid-20s over Glades, He…
$ EVENT_NARRATIVE    <chr> NA, NA, NA, NA, "Several trees were blown down alon…
$ DATA_SOURCE        <chr> "PDC", "PDC", "PDC", "PDC", "PDC", "PDC", "PDC", "P…

Narrow Results

I am only interested in storms in Texas for now, so I’m going to filter for that state.

texas <- df |> filter(STATE == "TEXAS")

texas |> glimpse()
Rows: 104,554
Columns: 51
$ BEGIN_YEARMONTH    <dbl> 200001, 200001, 200001, 200001, 200001, 200001, 200…
$ BEGIN_DAY          <dbl> 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,…
$ BEGIN_TIME         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1200, …
$ END_YEARMONTH      <dbl> 200001, 200001, 200001, 200001, 200001, 200001, 200…
$ END_DAY            <dbl> 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,…
$ END_TIME           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2300, …
$ EPISODE_ID         <dbl> 1090451, 1090451, 1090451, 1090451, 1090451, 109045…
$ EVENT_ID           <dbl> 5129018, 5129030, 5129031, 5129032, 5129005, 512902…
$ STATE              <chr> "TEXAS", "TEXAS", "TEXAS", "TEXAS", "TEXAS", "TEXAS…
$ STATE_FIPS         <dbl> 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,…
$ YEAR               <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 200…
$ MONTH_NAME         <chr> "January", "January", "January", "January", "Januar…
$ EVENT_TYPE         <chr> "Winter Storm", "Winter Storm", "Winter Storm", "Wi…
$ CZ_TYPE            <chr> "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "…
$ CZ_FIPS            <dbl> 98, 161, 159, 174, 92, 95, 157, 119, 106, 101, 103,…
$ CZ_NAME            <chr> "HASKELL", "LIMESTONE", "MCLENNAN", "MILAM", "COOKE…
$ WFO                <chr> "FWD", "FWD", "FWD", "FWD", "FWD", "FWD", "FWD", "F…
$ BEGIN_DATE_TIME    <chr> "25-JAN-00 00:00:00", "25-JAN-00 00:00:00", "25-JAN…
$ CZ_TIMEZONE        <chr> "CST", "CST", "CST", "CST", "CST", "CST", "CST", "C…
$ END_DATE_TIME      <chr> "28-JAN-00 00:00:00", "28-JAN-00 00:00:00", "28-JAN…
$ INJURIES_DIRECT    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ INJURIES_INDIRECT  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ DEATHS_DIRECT      <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ DEATHS_INDIRECT    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ DAMAGE_PROPERTY    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ DAMAGE_CROPS       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ SOURCE             <chr> "NEWSPAPER", "NEWSPAPER", "NEWSPAPER", "NEWSPAPER",…
$ MAGNITUDE          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ MAGNITUDE_TYPE     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ FLOOD_CAUSE        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ CATEGORY           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_F_SCALE        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_LENGTH         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_WIDTH          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_OTHER_WFO      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_OTHER_CZ_STATE <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_OTHER_CZ_FIPS  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TOR_OTHER_CZ_NAME  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ BEGIN_RANGE        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ BEGIN_AZIMUTH      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ BEGIN_LOCATION     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ END_RANGE          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ END_AZIMUTH        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ END_LOCATION       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ BEGIN_LAT          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ BEGIN_LON          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ END_LAT            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ END_LON            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ EPISODE_NARRATIVE  <chr> "An arctic cold front moved through North Texas on …
$ EVENT_NARRATIVE    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ DATA_SOURCE        <chr> "PDC", "PDC", "PDC", "PDC", "PDC", "PDC", "PDC", "P…
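
As a quick sanity check (just a sketch, not part of the cleaning), I could count Texas events per year to make sure every year from 2000 through 2023 made it into the filter:

# Count events per year to confirm all 24 years are present.
texas |> count(YEAR)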

Select Columns

I am going to select the columns I think I want to work with.

storms <- texas |> select(
  EVENT_TYPE,
  BEGIN_DATE_TIME,
  END_DATE_TIME,
  YEAR,
  INJURIES_DIRECT,
  INJURIES_INDIRECT,
  DEATHS_DIRECT,
  DEATHS_INDIRECT,
  DAMAGE_PROPERTY,
  DAMAGE_CROPS,
  CZ_TYPE,
  CZ_FIPS,
  CZ_NAME
  )

storms |> glimpse()
Rows: 104,554
Columns: 13
$ EVENT_TYPE        <chr> "Winter Storm", "Winter Storm", "Winter Storm", "Win…
$ BEGIN_DATE_TIME   <chr> "25-JAN-00 00:00:00", "25-JAN-00 00:00:00", "25-JAN-…
$ END_DATE_TIME     <chr> "28-JAN-00 00:00:00", "28-JAN-00 00:00:00", "28-JAN-…
$ YEAR              <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000…
$ INJURIES_DIRECT   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ INJURIES_INDIRECT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ DEATHS_DIRECT     <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ DEATHS_INDIRECT   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ DAMAGE_PROPERTY   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ DAMAGE_CROPS      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ CZ_TYPE           <chr> "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z…
$ CZ_FIPS           <dbl> 98, 161, 159, 174, 92, 95, 157, 119, 106, 101, 103, …
$ CZ_NAME           <chr> "HASKELL", "LIMESTONE", "MCLENNAN", "MILAM", "COOKE"…

Change types for some columns

The damage columns are stored as text, with a number followed by a unit letter (like “2K”). I need them to be actual numbers so I can work with them. The date columns are also the wrong type, so I will change those too.

First, I split each damage column into its number and its unit letter (K, M, B) so the values can be compared.

separate_col_damage <- storms |> mutate(
  # Everything except the last character is the number; the last character is the unit.
  number1 = str_sub(DAMAGE_PROPERTY, end = -2),
  unit1 = str_sub(DAMAGE_PROPERTY, -1),
  number2 = str_sub(DAMAGE_CROPS, end = -2),
  unit2 = str_sub(DAMAGE_CROPS, -1)
)

separate_col_damage

Next, I convert the number columns to numeric.

damage_col_num <- separate_col_damage |> mutate(
  number1 = as.numeric(number1),
  number2 = as.numeric(number2))
  
damage_col_num
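
Before trusting the conversion, a quick check (a hedged extra step, not required) would be to look for rows where the raw damage value exists but the parsed number came out NA, such as values with no leading digits:

# Rows where DAMAGE_PROPERTY has a value but number1 failed to parse.
damage_col_num |>
  filter(!is.na(DAMAGE_PROPERTY), is.na(number1)) |>
  count(DAMAGE_PROPERTY, sort = TRUE)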

Finally, I use the units to multiply the number columns and create damage columns with the actual dollar values.

damage_col <- damage_col_num |> mutate(
  # K = thousand, M = million, B = billion; otherwise keep the number as-is.
  damage_val_prop = if_else(unit1 == "K", number1 * 1000, if_else(unit1 == "M", number1 * 1000000, if_else(unit1 == "B", number1 * 1000000000, number1))),
  damage_val_crop = if_else(unit2 == "K", number2 * 1000, if_else(unit2 == "M", number2 * 1000000, if_else(unit2 == "B", number2 * 1000000000, number2)))
)

damage_col |> select(
  DAMAGE_PROPERTY,
  damage_val_prop,
  DAMAGE_CROPS,
  damage_val_crop
)
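
For reference, the same conversion could be done more compactly. This is only a sketch, assuming readr::parse_number() reads these strings the way I expect; it is not the approach used above:

# Parse the number and look up a multiplier for the trailing unit letter.
unit_multiplier <- c(K = 1e3, M = 1e6, B = 1e9)

damage_alt <- storms |> mutate(
  damage_val_prop = parse_number(DAMAGE_PROPERTY) *
    coalesce(unname(unit_multiplier[str_sub(DAMAGE_PROPERTY, -1)]), 1),
  damage_val_crop = parse_number(DAMAGE_CROPS) *
    coalesce(unname(unit_multiplier[str_sub(DAMAGE_CROPS, -1)]), 1)
)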

Now I convert the begin and end dates into a date type.

date_storm <- damage_col |> mutate(
  # Build a "25-JAN-2000"-style string from the day-month prefix plus YEAR, then parse with dmy().
  begin_date = dmy(paste0(substring(BEGIN_DATE_TIME, 1, 7), YEAR)),
  end_date = dmy(paste0(substring(END_DATE_TIME, 1, 7), YEAR)))

date_storm |> glimpse()
Rows: 104,554
Columns: 21
$ EVENT_TYPE        <chr> "Winter Storm", "Winter Storm", "Winter Storm", "Win…
$ BEGIN_DATE_TIME   <chr> "25-JAN-00 00:00:00", "25-JAN-00 00:00:00", "25-JAN-…
$ END_DATE_TIME     <chr> "28-JAN-00 00:00:00", "28-JAN-00 00:00:00", "28-JAN-…
$ YEAR              <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000…
$ INJURIES_DIRECT   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ INJURIES_INDIRECT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ DEATHS_DIRECT     <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ DEATHS_INDIRECT   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ DAMAGE_PROPERTY   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ DAMAGE_CROPS      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ CZ_TYPE           <chr> "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z…
$ CZ_FIPS           <dbl> 98, 161, 159, 174, 92, 95, 157, 119, 106, 101, 103, …
$ CZ_NAME           <chr> "HASKELL", "LIMESTONE", "MCLENNAN", "MILAM", "COOKE"…
$ number1           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ unit1             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ number2           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ unit2             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ damage_val_prop   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ damage_val_crop   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ begin_date        <date> 2000-01-25, 2000-01-25, 2000-01-25, 2000-01-25, 200…
$ end_date          <date> 2000-01-28, 2000-01-28, 2000-01-28, 2000-01-28, 200…
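
An alternative (just a sketch, not the approach above) would be to parse the full timestamp with lubridate::dmy_hms(), which also keeps the time of day. Two-digit years are resolved by lubridate’s defaults, which is safe here because every year falls between 2000 and 2023:

# Parse the full date-time string, then floor it to a date.
date_storm_alt <- damage_col |> mutate(
  begin_date = as_date(dmy_hms(BEGIN_DATE_TIME)),
  end_date = as_date(dmy_hms(END_DATE_TIME))
)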

I’m also going to uppercase all the values in the county/zone column. First, I check how many rows use the mixed-case “Travis” spelling. Then I change the column to all uppercase. Then I check that the change happened, since the same filter should now return zero rows.

# Check how many rows use the mixed-case "Travis" spelling before the change.
date_storm |> filter(CZ_NAME == "Travis")

cleaned_storm <- date_storm |> mutate(
  CZ_NAME = str_to_upper(CZ_NAME)
)

# After uppercasing, the same filter should return zero rows.
cleaned_storm |> filter(CZ_NAME == "Travis")
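
A broader check (a hedged extra, beyond the single spot-check above) would be to list every county or zone name that is not already uppercase:

# Any CZ_NAME values that change when uppercased.
date_storm |>
  filter(CZ_NAME != str_to_upper(CZ_NAME)) |>
  count(CZ_NAME, sort = TRUE)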

Select important columns

Finally, I drop the intermediate columns I created along the way and the original columns I no longer need.

clean_storm <- cleaned_storm |> select(
  -BEGIN_DATE_TIME,
  -END_DATE_TIME,
  -YEAR,
  -number1,
  -unit1,
  -number2,
  -unit2,
  -DAMAGE_PROPERTY,
  -DAMAGE_CROPS
) 

clean_storm |> glimpse()
Rows: 104,554
Columns: 12
$ EVENT_TYPE        <chr> "Winter Storm", "Winter Storm", "Winter Storm", "Win…
$ INJURIES_DIRECT   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ INJURIES_INDIRECT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ DEATHS_DIRECT     <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ DEATHS_INDIRECT   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ CZ_TYPE           <chr> "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z…
$ CZ_FIPS           <dbl> 98, 161, 159, 174, 92, 95, 157, 119, 106, 101, 103, …
$ CZ_NAME           <chr> "HASKELL", "LIMESTONE", "MCLENNAN", "MILAM", "COOKE"…
$ damage_val_prop   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ damage_val_crop   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ begin_date        <date> 2000-01-25, 2000-01-25, 2000-01-25, 2000-01-25, 200…
$ end_date          <date> 2000-01-28, 2000-01-28, 2000-01-28, 2000-01-28, 200…

Export

Exporting the cleaned data as an .rds file so I can use it in the analysis.

clean_storm |> write_rds("data-processed/01-storm-data.rds")
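
As a quick check that the export worked (a hedged extra step), the file could be read back in and glimpsed:

# Read the exported file back to confirm it round-trips.
read_rds("data-processed/01-storm-data.rds") |> glimpse()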
