Cleaning

Goals of this notebook

  1. Import all data
  2. Merge all sheets
  3. Remove duplicate rows
  4. Geocode
  5. Clean names in OpenRefine
  6. Fix date data types and group by dates
  7. Create and rename final columns
  8. Export clean data

Setup

library(tidyverse)
library(janitor)
library(stringr)
library(readxl)

1. Importing

Data set 1.

unhoused_raw_1 <- suppressWarnings(read_excel("data-raw/unhoused-raw-1.xlsx"))

Data set 2.

unhoused_raw_2 <- suppressWarnings(read_excel("data-raw/unhoused-raw-2.xlsx"))

Data set 3.

unhoused_raw_3 <- suppressWarnings(read_excel("data-raw/unhoused-raw-3.xlsx"))

Data set 4.

unhoused_raw_4 <- suppressWarnings(read_excel("data-raw/unhoused-raw-4.xlsx"))

Data set 5.

unhoused_raw_5 <- suppressWarnings(read_excel("data-raw/unhoused-raw-5.xlsx"))

2. Merging data sets

2.1 Evaluating merge readiness

Ensuring column names match.

unhoused_raw_1 |> glimpse()
Rows: 28,603
Columns: 15
$ Citation_Number             <chr> "N29599464", "N29599455", "N29599455", "N2…
$ Case_Number                 <chr> "N29599464-02 NT 2016", "N29599455-01 NT 2…
$ Offense_Date                <dttm> 2016-01-02, 2016-01-02, 2016-01-02, 2016-…
$ Defendant_Name              <chr> "KREUANILTITH, VIENG", "HARRISON, ROBIN", …
$ Defendent_Address           <chr> "HOMELESS", "HOMELESS", "HOMELESS", "HOMEL…
$ Phone_Number                <dbl> NA, NA, NA, NA, 7136970095, NA, NA, 281515…
$ Offense_location            <chr> "700, PRARIE", "300, FANNIN", "300, FANNIN…
$ Violation_Code              <chr> "CC941", "CC940", "CC941", "CC940", "CC940…
$ Violation_Description       <chr> "PLACE (BED MAT./PERSONAL POSS.) ON SIDEWA…
$ Totla_Fine_Amount_Due       <dbl> 0.0, 0.0, 0.0, 0.0, 278.2, 0.0, 0.0, 0.0, …
$ Totla_Fine_Amount_Paid      <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 239.2, 310.7…
$ Totla_Fine_Amount_Dismissed <dbl> 310.7, 239.0, 239.0, 310.7, 0.0, 184.0, 0.…
$ Judgment_Date               <dttm> 2016-09-10, 2016-02-24, 2016-02-24, 2016-…
$ Judgment                    <chr> "CASE DISMISSED - OFFICER NOT PRESENT", "C…
$ Disposition                 <chr> "DISMISSED AT TRIAL", "DISMISSED AT TRIAL"…
unhoused_raw_2 |> glimpse()
Rows: 344
Columns: 15
$ `Citation Number`             <chr> "M81035523", "M81037597", "N37932142", "…
$ `Case Number`                 <chr> "M81035523-02 NT 2025", "M81037597-01 NT…
$ `Offense Date`                <dttm> 2025-08-18, 2025-08-18, 2025-08-18, 202…
$ `Defendant Name`              <chr> "BAKER, GARY", "SMITH, FOREST", "WOODS, …
$ `Defendant Address`           <chr> "HOMELESS  HOUSTON,77002 TX", "HOMELESS …
$ `Defendant Phone Number`      <chr> NA, NA, NA, NA, NA, NA, NA, NA, "8325456…
$ `Offense Location`            <chr> "1900 RUNNELS STREET", "1900 CHARTRES ST…
$ `Violation Code`              <chr> "CC941", "CC940", "CC940", "CC940", "CC9…
$ `Violation Description`       <chr> "PLACE (BED MAT./PERSONAL POSS.) ON SIDE…
$ `Total Fine Amount Due`       <dbl> 194, 194, 194, 194, 209, 194, 194, 209, …
$ `Total Fine Amount Paid`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `Total Fine Amount Dismissed` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `Judgment Date`               <dttm> 2025-09-11, 2025-09-19, 2025-09-24, 202…
$ Judgment                      <chr> "FAILURE TO APPEAR", "FAILURE TO APPEAR"…
$ Disposition                   <chr> "UNDISPOSED", "UNDISPOSED", "UNDISPOSED"…
unhoused_raw_3 |> glimpse()
Rows: 1,136
Columns: 15
$ `Citation Number`             <chr> "M81027684", "M81035492", "M81035492", "…
$ `Case Number`                 <chr> "M81027684-02 NT 2025", "M81035492-01 NT…
$ `Offense Date`                <dttm> 2025-07-15, 2025-07-16, 2025-07-16, 202…
$ `Defendant Name`              <chr> "BAILEY, LEBERT", "BAYRON, LOUIS S", "BA…
$ `Defendant Address`           <chr> "HOMELESS  ,", "HOMELESS  HOUSTON,77002 …
$ `Defendant Phone Number`      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ `Offense Location`            <chr> "2000 BRAZOS STREET", "1900 CAPITOL STRE…
$ `Violation Code`              <chr> "CC940", "CC940", "CC941", "CC948", "CC9…
$ `Violation Description`       <chr> "SIT/LIE DOWN ON (BLANKET/STOOL) PLACED …
$ `Total Fine Amount Due`       <dbl> 0.0, 194.0, 194.0, 209.0, 265.2, 271.7, …
$ `Total Fine Amount Paid`      <dbl> 199, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `Total Fine Amount Dismissed` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `Judgment Date`               <dttm> 2025-07-16, 2025-10-06, 2025-10-06, 202…
$ Judgment                      <chr> "CREDIT TIME SERVED", "TRIAL (NON JURY)"…
$ Disposition                   <chr> "GUILTY TRIAL BY JUDGE", "UNDISPOSED", "…
unhoused_raw_4 |> glimpse()
Rows: 528
Columns: 15
$ `Citation Number`             <chr> "N40168816", "N40373496", "N40395117", "…
$ `Case Number`                 <chr> "N40168816-03 NT 2025", "N40373496-01 NT…
$ `Offense Date`                <dttm> 2025-10-15, 2025-10-15, 2025-10-15, 202…
$ `Defendant Name`              <chr> "AMAYA, WILMER", "CROSBY, SELWYN", "GARC…
$ `Defendant Address`           <chr> "6727 TELEPHONE RD #330  HOUSTON,77061 T…
$ `Defendant Phone Number`      <dbl> 8329403301, 2815288883, NA, NA, NA, NA, …
$ `Offense Location`            <chr> "8900 TOWN PARK DR.", "2100 PRESTON ST",…
$ `Violation Code`              <chr> "CC753", "CC940", "CC940", "CC940", "CC9…
$ `Violation Description`       <chr> "STREET VENDOR (EXPOSE FOR SALE) (SELL) …
$ `Total Fine Amount Due`       <dbl> 394, 194, 194, 194, 184, 194, 194, 209, …
$ `Total Fine Amount Paid`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `Total Fine Amount Dismissed` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `Judgment Date`               <dttm> 2025-11-18, 2025-11-19, 2025-11-20, 202…
$ Judgment                      <chr> "TRIAL (NON JURY)", "FAILURE TO APPEAR",…
$ Disposition                   <chr> "UNDISPOSED", "UNDISPOSED", "UNDISPOSED"…
unhoused_raw_5 |> glimpse()
Rows: 1,015
Columns: 15
$ `Citation Number`             <chr> "N40222481", "N40222497", "M81024487", "…
$ `Case Number`                 <chr> "N40222481-01 NT 2025", "N40222497-01 NT…
$ `Offense Date`                <dttm> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ `Defendant Name`              <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ `Defendant Address`           <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIG…
$ `Defendant Phone Number`      <chr> NA, NA, "3468224952", NA, "3468224952", …
$ `Offense Location`            <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WH…
$ `Violation Code`              <chr> "CC940", "CC940", "CC940", "CC940", "CC9…
$ `Violation Description`       <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS…
$ `Total Fine Amount Due`       <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0,…
$ `Total Fine Amount Paid`      <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0…
$ `Total Fine Amount Dismissed` <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, …
$ `Judgment Date`               <dttm> 2025-12-08, 2025-12-08, 2026-01-08, 202…
$ Judgment                      <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ Disposition                   <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…

2.3 Cleaning names

2.3.1 Data set 1

unhoused_names_1 <- unhoused_raw_1 |> 
  rename(defendant_address = Defendent_Address) |> # this was misspelled
  clean_names()

unhoused_names_1 |> glimpse()
Rows: 28,603
Columns: 15
$ citation_number             <chr> "N29599464", "N29599455", "N29599455", "N2…
$ case_number                 <chr> "N29599464-02 NT 2016", "N29599455-01 NT 2…
$ offense_date                <dttm> 2016-01-02, 2016-01-02, 2016-01-02, 2016-…
$ defendant_name              <chr> "KREUANILTITH, VIENG", "HARRISON, ROBIN", …
$ defendant_address           <chr> "HOMELESS", "HOMELESS", "HOMELESS", "HOMEL…
$ phone_number                <dbl> NA, NA, NA, NA, 7136970095, NA, NA, 281515…
$ offense_location            <chr> "700, PRARIE", "300, FANNIN", "300, FANNIN…
$ violation_code              <chr> "CC941", "CC940", "CC941", "CC940", "CC940…
$ violation_description       <chr> "PLACE (BED MAT./PERSONAL POSS.) ON SIDEWA…
$ totla_fine_amount_due       <dbl> 0.0, 0.0, 0.0, 0.0, 278.2, 0.0, 0.0, 0.0, …
$ totla_fine_amount_paid      <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 239.2, 310.7…
$ totla_fine_amount_dismissed <dbl> 310.7, 239.0, 239.0, 310.7, 0.0, 184.0, 0.…
$ judgment_date               <dttm> 2016-09-10, 2016-02-24, 2016-02-24, 2016-…
$ judgment                    <chr> "CASE DISMISSED - OFFICER NOT PRESENT", "C…
$ disposition                 <chr> "DISMISSED AT TRIAL", "DISMISSED AT TRIAL"…

2.3.2 Data set 2

unhoused_names_2 <- unhoused_raw_2 |> 
  clean_names()

unhoused_names_2 |> glimpse()
Rows: 344
Columns: 15
$ citation_number             <chr> "M81035523", "M81037597", "N37932142", "M8…
$ case_number                 <chr> "M81035523-02 NT 2025", "M81037597-01 NT 2…
$ offense_date                <dttm> 2025-08-18, 2025-08-18, 2025-08-18, 2025-…
$ defendant_name              <chr> "BAKER, GARY", "SMITH, FOREST", "WOODS, MI…
$ defendant_address           <chr> "HOMELESS  HOUSTON,77002 TX", "HOMELESS  ,…
$ defendant_phone_number      <chr> NA, NA, NA, NA, NA, NA, NA, NA, "832545658…
$ offense_location            <chr> "1900 RUNNELS STREET", "1900 CHARTRES STRE…
$ violation_code              <chr> "CC941", "CC940", "CC940", "CC940", "CC948…
$ violation_description       <chr> "PLACE (BED MAT./PERSONAL POSS.) ON SIDEWA…
$ total_fine_amount_due       <dbl> 194, 194, 194, 194, 209, 194, 194, 209, 18…
$ total_fine_amount_paid      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ judgment_date               <dttm> 2025-09-11, 2025-09-19, 2025-09-24, 2025-…
$ judgment                    <chr> "FAILURE TO APPEAR", "FAILURE TO APPEAR", …
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "UNDISPOSED", …

2.3.3 Data set 3

unhoused_names_3 <- unhoused_raw_3 |> 
  clean_names()

unhoused_names_3 |> glimpse()
Rows: 1,136
Columns: 15
$ citation_number             <chr> "M81027684", "M81035492", "M81035492", "M8…
$ case_number                 <chr> "M81027684-02 NT 2025", "M81035492-01 NT 2…
$ offense_date                <dttm> 2025-07-15, 2025-07-16, 2025-07-16, 2025-…
$ defendant_name              <chr> "BAILEY, LEBERT", "BAYRON, LOUIS S", "BAYR…
$ defendant_address           <chr> "HOMELESS  ,", "HOMELESS  HOUSTON,77002 TX…
$ defendant_phone_number      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ offense_location            <chr> "2000 BRAZOS STREET", "1900 CAPITOL STREET…
$ violation_code              <chr> "CC940", "CC940", "CC941", "CC948", "CC941…
$ violation_description       <chr> "SIT/LIE DOWN ON (BLANKET/STOOL) PLACED ON…
$ total_fine_amount_due       <dbl> 0.0, 194.0, 194.0, 209.0, 265.2, 271.7, 25…
$ total_fine_amount_paid      <dbl> 199, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ judgment_date               <dttm> 2025-07-16, 2025-10-06, 2025-10-06, 2025-…
$ judgment                    <chr> "CREDIT TIME SERVED", "TRIAL (NON JURY)", …
$ disposition                 <chr> "GUILTY TRIAL BY JUDGE", "UNDISPOSED", "UN…

2.3.4 Data set 4

unhoused_names_4 <- unhoused_raw_4 |> 
  clean_names()

unhoused_names_4 |> glimpse()
Rows: 528
Columns: 15
$ citation_number             <chr> "N40168816", "N40373496", "N40395117", "N4…
$ case_number                 <chr> "N40168816-03 NT 2025", "N40373496-01 NT 2…
$ offense_date                <dttm> 2025-10-15, 2025-10-15, 2025-10-15, 2025-…
$ defendant_name              <chr> "AMAYA, WILMER", "CROSBY, SELWYN", "GARCIA…
$ defendant_address           <chr> "6727 TELEPHONE RD #330  HOUSTON,77061 TX"…
$ defendant_phone_number      <dbl> 8329403301, 2815288883, NA, NA, NA, NA, NA…
$ offense_location            <chr> "8900 TOWN PARK DR.", "2100 PRESTON ST", "…
$ violation_code              <chr> "CC753", "CC940", "CC940", "CC940", "CC921…
$ violation_description       <chr> "STREET VENDOR (EXPOSE FOR SALE) (SELL) GO…
$ total_fine_amount_due       <dbl> 394, 194, 194, 194, 184, 194, 194, 209, 29…
$ total_fine_amount_paid      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ judgment_date               <dttm> 2025-11-18, 2025-11-19, 2025-11-20, 2025-…
$ judgment                    <chr> "TRIAL (NON JURY)", "FAILURE TO APPEAR", "…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "UNDISPOSED", …

2.3.5 Data set 5

unhoused_names_5 <- unhoused_raw_5 |> 
  clean_names()

unhoused_names_5 |> glimpse()
Rows: 1,015
Columns: 15
$ citation_number             <chr> "N40222481", "N40222497", "M81024487", "N4…
$ case_number                 <chr> "N40222481-01 NT 2025", "N40222497-01 NT 2…
$ offense_date                <dttm> 2025-10-01, 2025-10-01, 2025-10-01, 2025-…
$ defendant_name              <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR",…
$ defendant_address           <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIGAT…
$ defendant_phone_number      <chr> NA, NA, "3468224952", NA, "3468224952", NA…
$ offense_location            <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WHEE…
$ violation_code              <chr> "CC940", "CC940", "CC940", "CC940", "CC941…
$ violation_description       <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS S…
$ total_fine_amount_due       <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0, 2…
$ total_fine_amount_paid      <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0.0…
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, 0,…
$ judgment_date               <dttm> 2025-12-08, 2025-12-08, 2026-01-08, 2025-…
$ judgment                    <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAILUR…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIAL …

2.4 Preparing to merge

2.4.1 Data set 1

Fixing phone_number data type, fixing misspellings and adding data_origin column to data set 1.

unhoused_tidy_1 <- unhoused_names_1 |>
  mutate(phone_number = as.character(phone_number),
         data_origin = "unhoused_1"
         ) |> 
  rename(total_fine_amount_due = totla_fine_amount_due,
         total_fine_amount_paid = totla_fine_amount_paid,
         total_fine_amount_dismissed = totla_fine_amount_dismissed)

unhoused_tidy_1 |> glimpse()
Rows: 28,603
Columns: 16
$ citation_number             <chr> "N29599464", "N29599455", "N29599455", "N2…
$ case_number                 <chr> "N29599464-02 NT 2016", "N29599455-01 NT 2…
$ offense_date                <dttm> 2016-01-02, 2016-01-02, 2016-01-02, 2016-…
$ defendant_name              <chr> "KREUANILTITH, VIENG", "HARRISON, ROBIN", …
$ defendant_address           <chr> "HOMELESS", "HOMELESS", "HOMELESS", "HOMEL…
$ phone_number                <chr> NA, NA, NA, NA, "7136970095", NA, NA, "281…
$ offense_location            <chr> "700, PRARIE", "300, FANNIN", "300, FANNIN…
$ violation_code              <chr> "CC941", "CC940", "CC941", "CC940", "CC940…
$ violation_description       <chr> "PLACE (BED MAT./PERSONAL POSS.) ON SIDEWA…
$ total_fine_amount_due       <dbl> 0.0, 0.0, 0.0, 0.0, 278.2, 0.0, 0.0, 0.0, …
$ total_fine_amount_paid      <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 239.2, 310.7…
$ total_fine_amount_dismissed <dbl> 310.7, 239.0, 239.0, 310.7, 0.0, 184.0, 0.…
$ judgment_date               <dttm> 2016-09-10, 2016-02-24, 2016-02-24, 2016-…
$ judgment                    <chr> "CASE DISMISSED - OFFICER NOT PRESENT", "C…
$ disposition                 <chr> "DISMISSED AT TRIAL", "DISMISSED AT TRIAL"…
$ data_origin                 <chr> "unhoused_1", "unhoused_1", "unhoused_1", …

2.4.2 Data set 2

Adding data_origin column to data set 2 and renaming columns to match data set 1.

unhoused_tidy_2 <- unhoused_names_2 |> 
  mutate(data_origin = "unhoused_2") |> 
  rename(phone_number = defendant_phone_number)

unhoused_tidy_2 |> glimpse()
Rows: 344
Columns: 16
$ citation_number             <chr> "M81035523", "M81037597", "N37932142", "M8…
$ case_number                 <chr> "M81035523-02 NT 2025", "M81037597-01 NT 2…
$ offense_date                <dttm> 2025-08-18, 2025-08-18, 2025-08-18, 2025-…
$ defendant_name              <chr> "BAKER, GARY", "SMITH, FOREST", "WOODS, MI…
$ defendant_address           <chr> "HOMELESS  HOUSTON,77002 TX", "HOMELESS  ,…
$ phone_number                <chr> NA, NA, NA, NA, NA, NA, NA, NA, "832545658…
$ offense_location            <chr> "1900 RUNNELS STREET", "1900 CHARTRES STRE…
$ violation_code              <chr> "CC941", "CC940", "CC940", "CC940", "CC948…
$ violation_description       <chr> "PLACE (BED MAT./PERSONAL POSS.) ON SIDEWA…
$ total_fine_amount_due       <dbl> 194, 194, 194, 194, 209, 194, 194, 209, 18…
$ total_fine_amount_paid      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ judgment_date               <dttm> 2025-09-11, 2025-09-19, 2025-09-24, 2025-…
$ judgment                    <chr> "FAILURE TO APPEAR", "FAILURE TO APPEAR", …
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "UNDISPOSED", …
$ data_origin                 <chr> "unhoused_2", "unhoused_2", "unhoused_2", …

2.4.3 Data set 3

Adding data_origin column to data set 3 and renaming columns to match the other data sets.

unhoused_tidy_3 <- unhoused_names_3 |> 
  mutate(data_origin = "unhoused_3") |> 
  rename(phone_number = defendant_phone_number)

unhoused_tidy_3 |> glimpse()
Rows: 1,136
Columns: 16
$ citation_number             <chr> "M81027684", "M81035492", "M81035492", "M8…
$ case_number                 <chr> "M81027684-02 NT 2025", "M81035492-01 NT 2…
$ offense_date                <dttm> 2025-07-15, 2025-07-16, 2025-07-16, 2025-…
$ defendant_name              <chr> "BAILEY, LEBERT", "BAYRON, LOUIS S", "BAYR…
$ defendant_address           <chr> "HOMELESS  ,", "HOMELESS  HOUSTON,77002 TX…
$ phone_number                <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ offense_location            <chr> "2000 BRAZOS STREET", "1900 CAPITOL STREET…
$ violation_code              <chr> "CC940", "CC940", "CC941", "CC948", "CC941…
$ violation_description       <chr> "SIT/LIE DOWN ON (BLANKET/STOOL) PLACED ON…
$ total_fine_amount_due       <dbl> 0.0, 194.0, 194.0, 209.0, 265.2, 271.7, 25…
$ total_fine_amount_paid      <dbl> 199, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ judgment_date               <dttm> 2025-07-16, 2025-10-06, 2025-10-06, 2025-…
$ judgment                    <chr> "CREDIT TIME SERVED", "TRIAL (NON JURY)", …
$ disposition                 <chr> "GUILTY TRIAL BY JUDGE", "UNDISPOSED", "UN…
$ data_origin                 <chr> "unhoused_3", "unhoused_3", "unhoused_3", …

2.4.4 Data set 4

Fixing phone_number data type and adding data_origin column to data set 4. Renaming columns to match the other data sets.

unhoused_tidy_4 <- unhoused_names_4 |> 
  rename(phone_number = defendant_phone_number) |> 
  mutate(phone_number = as.character(phone_number),
         data_origin = "unhoused_4")

unhoused_tidy_4 |> glimpse()
Rows: 528
Columns: 16
$ citation_number             <chr> "N40168816", "N40373496", "N40395117", "N4…
$ case_number                 <chr> "N40168816-03 NT 2025", "N40373496-01 NT 2…
$ offense_date                <dttm> 2025-10-15, 2025-10-15, 2025-10-15, 2025-…
$ defendant_name              <chr> "AMAYA, WILMER", "CROSBY, SELWYN", "GARCIA…
$ defendant_address           <chr> "6727 TELEPHONE RD #330  HOUSTON,77061 TX"…
$ phone_number                <chr> "8329403301", "2815288883", NA, NA, NA, NA…
$ offense_location            <chr> "8900 TOWN PARK DR.", "2100 PRESTON ST", "…
$ violation_code              <chr> "CC753", "CC940", "CC940", "CC940", "CC921…
$ violation_description       <chr> "STREET VENDOR (EXPOSE FOR SALE) (SELL) GO…
$ total_fine_amount_due       <dbl> 394, 194, 194, 194, 184, 194, 194, 209, 29…
$ total_fine_amount_paid      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ judgment_date               <dttm> 2025-11-18, 2025-11-19, 2025-11-20, 2025-…
$ judgment                    <chr> "TRIAL (NON JURY)", "FAILURE TO APPEAR", "…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "UNDISPOSED", …
$ data_origin                 <chr> "unhoused_4", "unhoused_4", "unhoused_4", …

2.4.5 Data set 5

Fixing phone_number data type and adding data_origin column to data set 5. Renaming columns to match the other data sets.

unhoused_tidy_5 <- unhoused_names_5 |> 
  rename(phone_number = defendant_phone_number) |> 
  mutate(phone_number = as.character(phone_number),
         data_origin = "unhoused_5")

unhoused_tidy_5 |> glimpse()
Rows: 1,015
Columns: 16
$ citation_number             <chr> "N40222481", "N40222497", "M81024487", "N4…
$ case_number                 <chr> "N40222481-01 NT 2025", "N40222497-01 NT 2…
$ offense_date                <dttm> 2025-10-01, 2025-10-01, 2025-10-01, 2025-…
$ defendant_name              <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR",…
$ defendant_address           <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIGAT…
$ phone_number                <chr> NA, NA, "3468224952", NA, "3468224952", NA…
$ offense_location            <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WHEE…
$ violation_code              <chr> "CC940", "CC940", "CC940", "CC940", "CC941…
$ violation_description       <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS S…
$ total_fine_amount_due       <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0, 2…
$ total_fine_amount_paid      <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0.0…
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, 0,…
$ judgment_date               <dttm> 2025-12-08, 2025-12-08, 2026-01-08, 2025-…
$ judgment                    <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAILUR…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIAL …
$ data_origin                 <chr> "unhoused_5", "unhoused_5", "unhoused_5", …

2.5 Merging sheets

Merging sheets.

unhoused_merged <- bind_rows(unhoused_tidy_1, unhoused_tidy_2, unhoused_tidy_3, unhoused_tidy_4, unhoused_tidy_5)

unhoused_merged |> glimpse()
Rows: 31,626
Columns: 16
$ citation_number             <chr> "N29599464", "N29599455", "N29599455", "N2…
$ case_number                 <chr> "N29599464-02 NT 2016", "N29599455-01 NT 2…
$ offense_date                <dttm> 2016-01-02, 2016-01-02, 2016-01-02, 2016-…
$ defendant_name              <chr> "KREUANILTITH, VIENG", "HARRISON, ROBIN", …
$ defendant_address           <chr> "HOMELESS", "HOMELESS", "HOMELESS", "HOMEL…
$ phone_number                <chr> NA, NA, NA, NA, "7136970095", NA, NA, "281…
$ offense_location            <chr> "700, PRARIE", "300, FANNIN", "300, FANNIN…
$ violation_code              <chr> "CC941", "CC940", "CC941", "CC940", "CC940…
$ violation_description       <chr> "PLACE (BED MAT./PERSONAL POSS.) ON SIDEWA…
$ total_fine_amount_due       <dbl> 0.0, 0.0, 0.0, 0.0, 278.2, 0.0, 0.0, 0.0, …
$ total_fine_amount_paid      <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 239.2, 310.7…
$ total_fine_amount_dismissed <dbl> 310.7, 239.0, 239.0, 310.7, 0.0, 184.0, 0.…
$ judgment_date               <dttm> 2016-09-10, 2016-02-24, 2016-02-24, 2016-…
$ judgment                    <chr> "CASE DISMISSED - OFFICER NOT PRESENT", "C…
$ disposition                 <chr> "DISMISSED AT TRIAL", "DISMISSED AT TRIAL"…
$ data_origin                 <chr> "unhoused_1", "unhoused_1", "unhoused_1", …

3. Remove duplicate rows

3.1 Finding dupes

This shows both duplicates, so there are actually 219 (438/2) pairs of duplicates.

unhoused_merged |> 
   get_dupes(!c(data_origin, defendant_address))

3.2 Removing dupes

Proof it worked: subtract the number of duplicate pairs (219) from the total number of rows (31,626), that is how many rows are outputted below (31,407).

unhoused_no_dupes <- unhoused_merged |>
  arrange(desc(data_origin)) |> # distinct() keeps the first row between two duplicates, so arranging it in descending order by data_origin, it makes sure we keep the most recent row
  distinct(across(!c(
    data_origin, 
    defendant_address)),
    .keep_all = TRUE)

unhoused_no_dupes |> glimpse()
Rows: 31,407
Columns: 16
$ citation_number             <chr> "N40222481", "N40222497", "M81024487", "N4…
$ case_number                 <chr> "N40222481-01 NT 2025", "N40222497-01 NT 2…
$ offense_date                <dttm> 2025-10-01, 2025-10-01, 2025-10-01, 2025-…
$ defendant_name              <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR",…
$ defendant_address           <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIGAT…
$ phone_number                <chr> NA, NA, "3468224952", NA, "3468224952", NA…
$ offense_location            <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WHEE…
$ violation_code              <chr> "CC940", "CC940", "CC940", "CC940", "CC941…
$ violation_description       <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS S…
$ total_fine_amount_due       <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0, 2…
$ total_fine_amount_paid      <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0.0…
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, 0,…
$ judgment_date               <dttm> 2025-12-08, 2025-12-08, 2026-01-08, 2025-…
$ judgment                    <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAILUR…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIAL …
$ data_origin                 <chr> "unhoused_5", "unhoused_5", "unhoused_5", …

4. Geocoding

4.1 Reformatting addresses

Reformatting addresses to prepare to geocode externally using geocodeio.

unhoused_reformatted <- unhoused_no_dupes |>
  mutate(full_address = paste(str_remove(offense_location, ","),
                              "Houston, TX", sep = ", "))

unhoused_reformatted |> glimpse()
Rows: 31,407
Columns: 17
$ citation_number             <chr> "N40222481", "N40222497", "M81024487", "N4…
$ case_number                 <chr> "N40222481-01 NT 2025", "N40222497-01 NT 2…
$ offense_date                <dttm> 2025-10-01, 2025-10-01, 2025-10-01, 2025-…
$ defendant_name              <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR",…
$ defendant_address           <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIGAT…
$ phone_number                <chr> NA, NA, "3468224952", NA, "3468224952", NA…
$ offense_location            <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WHEE…
$ violation_code              <chr> "CC940", "CC940", "CC940", "CC940", "CC941…
$ violation_description       <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS S…
$ total_fine_amount_due       <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0, 2…
$ total_fine_amount_paid      <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0.0…
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, 0,…
$ judgment_date               <dttm> 2025-12-08, 2025-12-08, 2026-01-08, 2025-…
$ judgment                    <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAILUR…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIAL …
$ data_origin                 <chr> "unhoused_5", "unhoused_5", "unhoused_5", …
$ full_address                <chr> "400 MCKINNEY, Houston, TX", "400 MCKINNEY…

4.2 Geocoding sets 1-4

4.2.1 Isolating sets 1-4

Isolating data sets from 1-4 to geocode.

unhoused_to_geocode_1 <- unhoused_reformatted |> 
  filter(!c(data_origin == "unhoused_5"))

unhoused_to_geocode_1 |> glimpse()
Rows: 30,392
Columns: 17
$ citation_number             <chr> "N40168816", "N40395117", "N40363023", "M8…
$ case_number                 <chr> "N40168816-03 NT 2025", "N40395117-01 NT 2…
$ offense_date                <dttm> 2025-10-15, 2025-10-15, 2025-10-15, 2025-…
$ defendant_name              <chr> "AMAYA, WILMER", "GARCIA, DANIEL", "SPIVEY…
$ defendant_address           <chr> "6727 TELEPHONE RD #330  HOUSTON,77061 TX"…
$ phone_number                <chr> "8329403301", NA, NA, NA, NA, NA, NA, NA, …
$ offense_location            <chr> "8900 TOWN PARK DR.", "2000 CRAWFORD", "46…
$ violation_code              <chr> "CC753", "CC940", "CC940", "CC921", "CC940…
$ violation_description       <chr> "STREET VENDOR (EXPOSE FOR SALE) (SELL) GO…
$ total_fine_amount_due       <dbl> 394, 194, 194, 184, 194, 194, 209, 294, 29…
$ total_fine_amount_paid      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ judgment_date               <dttm> 2025-11-18, 2025-11-20, 2025-11-14, 2025-…
$ judgment                    <chr> "TRIAL (NON JURY)", "FAILURE TO APPEAR (FT…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "UNDISPOSED", …
$ data_origin                 <chr> "unhoused_4", "unhoused_4", "unhoused_4", …
$ full_address                <chr> "8900 TOWN PARK DR., Houston, TX", "2000 C…

4.2.2 Exporting pre-geocode sets 1-4

Exporting data sets 1-4 to geocode in geocodio.

unhoused_to_geocode_1 |>
  write_csv("data-processed/unhoused-to-geocode-1.csv")

4.2.3 Importing geocoded

Importing the latest batch of geocoded data from geocodio and cleaning column names. @NOTE: There are more rows here than previously exported because this batch was geocoded before receiving fifth data set. unhoused_4 decreased in rows after introducing unhoused_5 because there was overlap between the two, and we kept most updated data in unhoused_5 — so no rows were lost, just transferred. This just means it will be necessary to remove duplicates again after merging in 4.4.

unhoused_geocoded_1 <- read_csv("data-processed/unhoused-geocoded-1.csv") |> 
  clean_names() # the new geocoded columns didn't have clean names
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 30522 Columns: 34
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (21): citation_number, case_number, defendant_name, defendant_address, ...
dbl   (8): phone_number, total_fine_amount_due, total_fine_amount_paid, tota...
lgl   (3): Geocodio Address Line 2, Geocodio Unit Type, Geocodio Unit Number
dttm  (2): offense_date, judgment_date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unhoused_geocoded_1 |> glimpse()
Rows: 30,522
Columns: 34
$ citation_number             <chr> "N40168816", "N40373496", "N40395117", "N4…
$ case_number                 <chr> "N40168816-03 NT 2025", "N40373496-01 NT 2…
$ offense_date                <dttm> 2025-10-15, 2025-10-15, 2025-10-15, 2025-…
$ defendant_name              <chr> "AMAYA, WILMER", "CROSBY, SELWYN", "GARCIA…
$ defendant_address           <chr> "6727 TELEPHONE RD #330  HOUSTON,77061 TX"…
$ phone_number                <dbl> 8329403301, 2815288883, NA, NA, NA, NA, NA…
$ offense_location            <chr> "8900 TOWN PARK DR.", "2100 PRESTON ST", "…
$ violation_code              <chr> "CC753", "CC940", "CC940", "CC940", "CC921…
$ violation_description       <chr> "STREET VENDOR (EXPOSE FOR SALE) (SELL) GO…
$ total_fine_amount_due       <dbl> 394, 194, 194, 194, 184, 194, 194, 209, 29…
$ total_fine_amount_paid      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ judgment_date               <dttm> 2025-11-18, 2025-11-19, 2025-11-20, 2025-…
$ judgment                    <chr> "TRIAL (NON JURY)", "FAILURE TO APPEAR", "…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "UNDISPOSED", …
$ data_origin                 <chr> "unhoused_4", "unhoused_4", "unhoused_4", …
$ full_address                <chr> "8900 TOWN PARK DR., Houston, TX", "2100 P…
$ geocodio_latitude           <dbl> 29.71419, 29.75580, 29.74602, 29.73307, 29…
$ geocodio_longitude          <dbl> -95.54310, -95.35188, -95.36641, -95.38406…
$ geocodio_accuracy_score     <dbl> 1.00, 1.00, 0.93, 1.00, 1.00, 1.00, 1.00, …
$ geocodio_accuracy_type      <chr> "range_interpolation", "range_interpolatio…
$ geocodio_address_line_1     <chr> "8900 Town Park Dr", "2100 Preston St", "2…
$ geocodio_address_line_2     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_address_line_3     <chr> "Houston, TX 77036", "Houston, TX 77002", …
$ geocodio_house_number       <chr> "8900", "2100", "2000", "4600", "2300", "1…
$ geocodio_street             <chr> "Town Park Dr", "Preston St", "Crawford St…
$ geocodio_unit_type          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_unit_number        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_city               <chr> "Houston", "Houston", "Houston", "Houston"…
$ geocodio_state              <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX", …
$ geocodio_county             <chr> "Harris County", "Harris County", "Harris …
$ geocodio_postal_code        <dbl> 77036, 77002, 77002, 77002, 77009, 77002, …
$ geocodio_country            <chr> "US", "US", "US", "US", "US", "US", "US", …
$ geocodio_source             <chr> "TIGER/Line® from the US Census Bureau", "…

4.3 Geocoding set 5

4.2.1 Isolating set 5

Isolating data set 5 to geocode.

unhoused_to_geocode_2 <- unhoused_reformatted |> 
  filter(data_origin == "unhoused_5")

unhoused_to_geocode_2 |> glimpse()
Rows: 1,015
Columns: 17
$ citation_number             <chr> "N40222481", "N40222497", "M81024487", "N4…
$ case_number                 <chr> "N40222481-01 NT 2025", "N40222497-01 NT 2…
$ offense_date                <dttm> 2025-10-01, 2025-10-01, 2025-10-01, 2025-…
$ defendant_name              <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR",…
$ defendant_address           <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIGAT…
$ phone_number                <chr> NA, NA, "3468224952", NA, "3468224952", NA…
$ offense_location            <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WHEE…
$ violation_code              <chr> "CC940", "CC940", "CC940", "CC940", "CC941…
$ violation_description       <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS S…
$ total_fine_amount_due       <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0, 2…
$ total_fine_amount_paid      <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0.0…
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, 0,…
$ judgment_date               <dttm> 2025-12-08, 2025-12-08, 2026-01-08, 2025-…
$ judgment                    <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAILUR…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIAL …
$ data_origin                 <chr> "unhoused_5", "unhoused_5", "unhoused_5", …
$ full_address                <chr> "400 MCKINNEY, Houston, TX", "400 MCKINNEY…

4.2.2 Exporting pre-geocode

Exporting data set 5 to geocode in geocodio.

unhoused_to_geocode_2 |>
  write_csv("data-processed/unhoused-to-geocode-2.csv")

4.2.3 Importing geocoded

Importing geocoded data set 5 from geocodio and cleaning column names.

unhoused_geocoded_2 <- read_csv("data-processed/unhoused-geocoded-2.csv") |> 
  clean_names() # the new geocoded columns didn't have clean names
Rows: 1015 Columns: 34
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (22): citation_number, case_number, defendant_name, defendant_address, ...
dbl   (7): total_fine_amount_due, total_fine_amount_paid, total_fine_amount_...
lgl   (3): Geocodio Address Line 2, Geocodio Unit Type, Geocodio Unit Number
dttm  (2): offense_date, judgment_date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unhoused_geocoded_2 |> glimpse()
Rows: 1,015
Columns: 34
$ citation_number             <chr> "N40222481", "N40222497", "M81024487", "N4…
$ case_number                 <chr> "N40222481-01 NT 2025", "N40222497-01 NT 2…
$ offense_date                <dttm> 2025-10-01, 2025-10-01, 2025-10-01, 2025-…
$ defendant_name              <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR",…
$ defendant_address           <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIGAT…
$ phone_number                <chr> NA, NA, "3468224952", NA, "3468224952", NA…
$ offense_location            <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WHEE…
$ violation_code              <chr> "CC940", "CC940", "CC940", "CC940", "CC941…
$ violation_description       <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS S…
$ total_fine_amount_due       <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0, 2…
$ total_fine_amount_paid      <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0.0…
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, 0,…
$ judgment_date               <dttm> 2025-12-08, 2025-12-08, 2026-01-08, 2025-…
$ judgment                    <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAILUR…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIAL …
$ data_origin                 <chr> "unhoused_5", "unhoused_5", "unhoused_5", …
$ full_address                <chr> "400 MCKINNEY, Houston, TX", "400 MCKINNEY…
$ geocodio_latitude           <dbl> 29.76007, 29.76007, 29.73310, 29.73422, 29…
$ geocodio_longitude          <dbl> -95.37008, -95.37008, -95.38127, -95.30282…
$ geocodio_accuracy_score     <dbl> 0.92, 0.92, 1.00, 1.00, 1.00, 1.00, 1.00, …
$ geocodio_accuracy_type      <chr> "range_interpolation", "range_interpolatio…
$ geocodio_address_line_1     <chr> "400 McKinney St", "400 McKinney St", "120…
$ geocodio_address_line_2     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_address_line_3     <chr> "Houston, TX 77002", "Houston, TX 77002", …
$ geocodio_house_number       <chr> "400", "400", "1200", "7000", "1200", "230…
$ geocodio_street             <chr> "McKinney St", "McKinney St", "Wheeler St"…
$ geocodio_unit_type          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_unit_number        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_city               <chr> "Houston", "Houston", "Houston", "Houston"…
$ geocodio_state              <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX", …
$ geocodio_county             <chr> "Harris County", "Harris County", "Harris …
$ geocodio_postal_code        <dbl> 77002, 77002, 77004, 77011, 77004, 77020, …
$ geocodio_country            <chr> "US", "US", "US", "US", "US", "US", "US", …
$ geocodio_source             <chr> "TIGER/Line® from the US Census Bureau", "…

4.4 Merging

Merging the latest batch of geocoded data with the rest of the data set.

4.4.1 Preparing to merge

The data sets have mismatched phone_number data types, this fixes that.

unhoused_geocoded_1_fixed <- unhoused_geocoded_1 |> 
  mutate(phone_number = as.character(phone_number))
         
unhoused_geocoded_1_fixed |> glimpse()
Rows: 30,522
Columns: 34
$ citation_number             <chr> "N40168816", "N40373496", "N40395117", "N4…
$ case_number                 <chr> "N40168816-03 NT 2025", "N40373496-01 NT 2…
$ offense_date                <dttm> 2025-10-15, 2025-10-15, 2025-10-15, 2025-…
$ defendant_name              <chr> "AMAYA, WILMER", "CROSBY, SELWYN", "GARCIA…
$ defendant_address           <chr> "6727 TELEPHONE RD #330  HOUSTON,77061 TX"…
$ phone_number                <chr> "8329403301", "2815288883", NA, NA, NA, NA…
$ offense_location            <chr> "8900 TOWN PARK DR.", "2100 PRESTON ST", "…
$ violation_code              <chr> "CC753", "CC940", "CC940", "CC940", "CC921…
$ violation_description       <chr> "STREET VENDOR (EXPOSE FOR SALE) (SELL) GO…
$ total_fine_amount_due       <dbl> 394, 194, 194, 194, 184, 194, 194, 209, 29…
$ total_fine_amount_paid      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ judgment_date               <dttm> 2025-11-18, 2025-11-19, 2025-11-20, 2025-…
$ judgment                    <chr> "TRIAL (NON JURY)", "FAILURE TO APPEAR", "…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "UNDISPOSED", …
$ data_origin                 <chr> "unhoused_4", "unhoused_4", "unhoused_4", …
$ full_address                <chr> "8900 TOWN PARK DR., Houston, TX", "2100 P…
$ geocodio_latitude           <dbl> 29.71419, 29.75580, 29.74602, 29.73307, 29…
$ geocodio_longitude          <dbl> -95.54310, -95.35188, -95.36641, -95.38406…
$ geocodio_accuracy_score     <dbl> 1.00, 1.00, 0.93, 1.00, 1.00, 1.00, 1.00, …
$ geocodio_accuracy_type      <chr> "range_interpolation", "range_interpolatio…
$ geocodio_address_line_1     <chr> "8900 Town Park Dr", "2100 Preston St", "2…
$ geocodio_address_line_2     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_address_line_3     <chr> "Houston, TX 77036", "Houston, TX 77002", …
$ geocodio_house_number       <chr> "8900", "2100", "2000", "4600", "2300", "1…
$ geocodio_street             <chr> "Town Park Dr", "Preston St", "Crawford St…
$ geocodio_unit_type          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_unit_number        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_city               <chr> "Houston", "Houston", "Houston", "Houston"…
$ geocodio_state              <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX", …
$ geocodio_county             <chr> "Harris County", "Harris County", "Harris …
$ geocodio_postal_code        <dbl> 77036, 77002, 77002, 77002, 77009, 77002, …
$ geocodio_country            <chr> "US", "US", "US", "US", "US", "US", "US", …
$ geocodio_source             <chr> "TIGER/Line® from the US Census Bureau", "…

4.4.2 Merging

unhoused_geocoded_merged <- bind_rows(unhoused_geocoded_1_fixed, unhoused_geocoded_2)

unhoused_geocoded_merged |> glimpse()  
Rows: 31,537
Columns: 34
$ citation_number             <chr> "N40168816", "N40373496", "N40395117", "N4…
$ case_number                 <chr> "N40168816-03 NT 2025", "N40373496-01 NT 2…
$ offense_date                <dttm> 2025-10-15, 2025-10-15, 2025-10-15, 2025-…
$ defendant_name              <chr> "AMAYA, WILMER", "CROSBY, SELWYN", "GARCIA…
$ defendant_address           <chr> "6727 TELEPHONE RD #330  HOUSTON,77061 TX"…
$ phone_number                <chr> "8329403301", "2815288883", NA, NA, NA, NA…
$ offense_location            <chr> "8900 TOWN PARK DR.", "2100 PRESTON ST", "…
$ violation_code              <chr> "CC753", "CC940", "CC940", "CC940", "CC921…
$ violation_description       <chr> "STREET VENDOR (EXPOSE FOR SALE) (SELL) GO…
$ total_fine_amount_due       <dbl> 394, 194, 194, 194, 184, 194, 194, 209, 29…
$ total_fine_amount_paid      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ judgment_date               <dttm> 2025-11-18, 2025-11-19, 2025-11-20, 2025-…
$ judgment                    <chr> "TRIAL (NON JURY)", "FAILURE TO APPEAR", "…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "UNDISPOSED", …
$ data_origin                 <chr> "unhoused_4", "unhoused_4", "unhoused_4", …
$ full_address                <chr> "8900 TOWN PARK DR., Houston, TX", "2100 P…
$ geocodio_latitude           <dbl> 29.71419, 29.75580, 29.74602, 29.73307, 29…
$ geocodio_longitude          <dbl> -95.54310, -95.35188, -95.36641, -95.38406…
$ geocodio_accuracy_score     <dbl> 1.00, 1.00, 0.93, 1.00, 1.00, 1.00, 1.00, …
$ geocodio_accuracy_type      <chr> "range_interpolation", "range_interpolatio…
$ geocodio_address_line_1     <chr> "8900 Town Park Dr", "2100 Preston St", "2…
$ geocodio_address_line_2     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_address_line_3     <chr> "Houston, TX 77036", "Houston, TX 77002", …
$ geocodio_house_number       <chr> "8900", "2100", "2000", "4600", "2300", "1…
$ geocodio_street             <chr> "Town Park Dr", "Preston St", "Crawford St…
$ geocodio_unit_type          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_unit_number        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_city               <chr> "Houston", "Houston", "Houston", "Houston"…
$ geocodio_state              <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX", …
$ geocodio_county             <chr> "Harris County", "Harris County", "Harris …
$ geocodio_postal_code        <dbl> 77036, 77002, 77002, 77002, 77009, 77002, …
$ geocodio_country            <chr> "US", "US", "US", "US", "US", "US", "US", …
$ geocodio_source             <chr> "TIGER/Line® from the US Census Bureau", "…

4.5 Removing duplicates

See 4.2.3 for explanation on why this is necessary again.

4.5.1 Finding dupes

This shows both duplicates, so there are actually 130 (260/2) pairs of duplicates.

unhoused_geocoded_merged |> 
   get_dupes(!c(data_origin, defendant_address))

4.5.2 Removing dupes

Proof it worked: subtract the number of duplicate pairs (130) from the total number of rows (31,537), that is how many rows are outputted below (31,407).

unhoused_geocoded_no_dupes <- unhoused_geocoded_merged |>
  arrange(desc(data_origin)) |> # distinct() keeps the first row between two duplicates, so arranging it in descending order by data_origin, it makes sure we keep the most recent row
  distinct(across(!c(
    data_origin, 
    defendant_address)),
    .keep_all = TRUE)

unhoused_geocoded_no_dupes |> glimpse()
Rows: 31,407
Columns: 34
$ citation_number             <chr> "N40222481", "N40222497", "M81024487", "N4…
$ case_number                 <chr> "N40222481-01 NT 2025", "N40222497-01 NT 2…
$ offense_date                <dttm> 2025-10-01, 2025-10-01, 2025-10-01, 2025-…
$ defendant_name              <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR",…
$ defendant_address           <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIGAT…
$ phone_number                <chr> NA, NA, "3468224952", NA, "3468224952", NA…
$ offense_location            <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WHEE…
$ violation_code              <chr> "CC940", "CC940", "CC940", "CC940", "CC941…
$ violation_description       <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS S…
$ total_fine_amount_due       <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0, 2…
$ total_fine_amount_paid      <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0.0…
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, 0,…
$ judgment_date               <dttm> 2025-12-08, 2025-12-08, 2026-01-08, 2025-…
$ judgment                    <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAILUR…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIAL …
$ data_origin                 <chr> "unhoused_5", "unhoused_5", "unhoused_5", …
$ full_address                <chr> "400 MCKINNEY, Houston, TX", "400 MCKINNEY…
$ geocodio_latitude           <dbl> 29.76007, 29.76007, 29.73310, 29.73422, 29…
$ geocodio_longitude          <dbl> -95.37008, -95.37008, -95.38127, -95.30282…
$ geocodio_accuracy_score     <dbl> 0.92, 0.92, 1.00, 1.00, 1.00, 1.00, 1.00, …
$ geocodio_accuracy_type      <chr> "range_interpolation", "range_interpolatio…
$ geocodio_address_line_1     <chr> "400 McKinney St", "400 McKinney St", "120…
$ geocodio_address_line_2     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_address_line_3     <chr> "Houston, TX 77002", "Houston, TX 77002", …
$ geocodio_house_number       <chr> "400", "400", "1200", "7000", "1200", "230…
$ geocodio_street             <chr> "McKinney St", "McKinney St", "Wheeler St"…
$ geocodio_unit_type          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_unit_number        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_city               <chr> "Houston", "Houston", "Houston", "Houston"…
$ geocodio_state              <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX", …
$ geocodio_county             <chr> "Harris County", "Harris County", "Harris …
$ geocodio_postal_code        <dbl> 77002, 77002, 77004, 77011, 77004, 77020, …
$ geocodio_country            <chr> "US", "US", "US", "US", "US", "US", "US", …
$ geocodio_source             <chr> "TIGER/Line® from the US Census Bureau", "…

5. OpenRefine

Exporting cleaned data to clean further in OpenRefine. Everything accomplished in OpenRefine is listed in section 5.2.

5.1 Exporting for OpenRefine

Exporting clean data to clean more in OpenRefine.

unhoused_to_refine <- unhoused_geocoded_no_dupes |> 
  write_csv("data-processed/unhoused-to-refine.csv")

unhoused_to_refine |> glimpse()
Rows: 31,407
Columns: 34
$ citation_number             <chr> "N40222481", "N40222497", "M81024487", "N4…
$ case_number                 <chr> "N40222481-01 NT 2025", "N40222497-01 NT 2…
$ offense_date                <dttm> 2025-10-01, 2025-10-01, 2025-10-01, 2025-…
$ defendant_name              <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR",…
$ defendant_address           <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIGAT…
$ phone_number                <chr> NA, NA, "3468224952", NA, "3468224952", NA…
$ offense_location            <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WHEE…
$ violation_code              <chr> "CC940", "CC940", "CC940", "CC940", "CC941…
$ violation_description       <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS S…
$ total_fine_amount_due       <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0, 2…
$ total_fine_amount_paid      <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0.0…
$ total_fine_amount_dismissed <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, 0,…
$ judgment_date               <dttm> 2025-12-08, 2025-12-08, 2026-01-08, 2025-…
$ judgment                    <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAILUR…
$ disposition                 <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIAL …
$ data_origin                 <chr> "unhoused_5", "unhoused_5", "unhoused_5", …
$ full_address                <chr> "400 MCKINNEY, Houston, TX", "400 MCKINNEY…
$ geocodio_latitude           <dbl> 29.76007, 29.76007, 29.73310, 29.73422, 29…
$ geocodio_longitude          <dbl> -95.37008, -95.37008, -95.38127, -95.30282…
$ geocodio_accuracy_score     <dbl> 0.92, 0.92, 1.00, 1.00, 1.00, 1.00, 1.00, …
$ geocodio_accuracy_type      <chr> "range_interpolation", "range_interpolatio…
$ geocodio_address_line_1     <chr> "400 McKinney St", "400 McKinney St", "120…
$ geocodio_address_line_2     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_address_line_3     <chr> "Houston, TX 77002", "Houston, TX 77002", …
$ geocodio_house_number       <chr> "400", "400", "1200", "7000", "1200", "230…
$ geocodio_street             <chr> "McKinney St", "McKinney St", "Wheeler St"…
$ geocodio_unit_type          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_unit_number        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ geocodio_city               <chr> "Houston", "Houston", "Houston", "Houston"…
$ geocodio_state              <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX", …
$ geocodio_county             <chr> "Harris County", "Harris County", "Harris …
$ geocodio_postal_code        <dbl> 77002, 77002, 77004, 77011, 77004, 77020, …
$ geocodio_country            <chr> "US", "US", "US", "US", "US", "US", "US", …
$ geocodio_source             <chr> "TIGER/Line® from the US Census Bureau", "…

5.2 Importing from OpenRefine

Importing data from OpenRefine. Here are the changes made:

  • Clustered misspelled and varied names.
  • Created a column that standardizes some judgments more generally.
  • Created a column that standardizes some dispositions more generally.
  • Created a column indicating whether someone has served jail time.
  • Removed all 16 “test” rows.
  • Trimmed all leading and trailing whitespace for all columns.
  • Made all cell entries uppercase.
  • Created a column that standardizes violation descriptions, because starting in October 2025, CC940 and 941 have different violation descriptions than they previously did.
unhoused_refined <- read_csv("data-processed/unhoused-refined.csv")
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 31391 Columns: 39
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (25): citation_number, case_number, defendant_name, refined_name, defen...
dbl   (9): phone_number, total_fine_amount_due, total_fine_amount_paid, tota...
lgl   (3): geocodio_address_line_2, geocodio_unit_type, geocodio_unit_number
dttm  (2): offense_date, judgment_date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unhoused_refined |> glimpse()
Rows: 31,391
Columns: 39
$ citation_number               <chr> "N40222481", "N40222497", "M81024487", "…
$ case_number                   <chr> "N40222481-01 NT 2025", "N40222497-01 NT…
$ offense_date                  <dttm> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ defendant_name                <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ refined_name                  <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ defendant_address             <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIG…
$ phone_number                  <dbl> NA, NA, 3468224952, NA, 3468224952, NA, …
$ offense_location              <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WH…
$ violation_code                <chr> "CC940", "CC940", "CC940", "CC940", "CC9…
$ violation_description         <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS…
$ violation_description_refined <chr> "SIT/LIE DOWN ON (BLANKET/STOOL) PLACED …
$ total_fine_amount_due         <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0,…
$ total_fine_amount_paid        <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0…
$ total_fine_amount_dismissed   <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, …
$ judgment_date                 <dttm> 2025-12-08, 2025-12-08, 2026-01-08, 202…
$ time_served                   <chr> "NO", "NO", "YES", "NO", "YES", "NO", "N…
$ judgment_refined              <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ judgment                      <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ disposition_refined           <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ disposition                   <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ data_origin                   <chr> "UNHOUSED_5", "UNHOUSED_5", "UNHOUSED_5"…
$ full_address                  <chr> "400 MCKINNEY, HOUSTON, TX", "400 MCKINN…
$ geocodio_latitude             <dbl> 29.76007, 29.76007, 29.73310, 29.73422, …
$ geocodio_longitude            <dbl> -95.37008, -95.37008, -95.38127, -95.302…
$ geocodio_accuracy_score       <dbl> 0.92, 0.92, 1.00, 1.00, 1.00, 1.00, 1.00…
$ geocodio_accuracy_type        <chr> "RANGE_INTERPOLATION", "RANGE_INTERPOLAT…
$ geocodio_address_line_1       <chr> "400 MCKINNEY ST", "400 MCKINNEY ST", "1…
$ geocodio_address_line_2       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_address_line_3       <chr> "HOUSTON, TX 77002", "HOUSTON, TX 77002"…
$ geocodio_house_number         <dbl> 400, 400, 1200, 7000, 1200, 2300, 2111, …
$ geocodio_street               <chr> "MCKINNEY ST", "MCKINNEY ST", "WHEELER S…
$ geocodio_unit_type            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_unit_number          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_city                 <chr> "HOUSTON", "HOUSTON", "HOUSTON", "HOUSTO…
$ geocodio_state                <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX"…
$ geocodio_county               <chr> "HARRIS COUNTY", "HARRIS COUNTY", "HARRI…
$ geocodio_postal_code          <dbl> 77002, 77002, 77004, 77011, 77004, 77020…
$ geocodio_country              <chr> "US", "US", "US", "US", "US", "US", "US"…
$ geocodio_source               <chr> "TIGER/LINE® FROM THE US CENSUS BUREAU",…

6. Fixing and grouping dates

6.1 Fixing data types

Changing offense_date and judgment_date data types from “dttm” to “date.”

unhoused_data_types <- unhoused_refined |>
  mutate(offense_date = as_date(offense_date),
         judgment_date = as_date(judgment_date),
        )

unhoused_data_types |> glimpse()
Rows: 31,391
Columns: 39
$ citation_number               <chr> "N40222481", "N40222497", "M81024487", "…
$ case_number                   <chr> "N40222481-01 NT 2025", "N40222497-01 NT…
$ offense_date                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ defendant_name                <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ refined_name                  <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ defendant_address             <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIG…
$ phone_number                  <dbl> NA, NA, 3468224952, NA, 3468224952, NA, …
$ offense_location              <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WH…
$ violation_code                <chr> "CC940", "CC940", "CC940", "CC940", "CC9…
$ violation_description         <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS…
$ violation_description_refined <chr> "SIT/LIE DOWN ON (BLANKET/STOOL) PLACED …
$ total_fine_amount_due         <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0,…
$ total_fine_amount_paid        <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0…
$ total_fine_amount_dismissed   <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, …
$ judgment_date                 <date> 2025-12-08, 2025-12-08, 2026-01-08, 202…
$ time_served                   <chr> "NO", "NO", "YES", "NO", "YES", "NO", "N…
$ judgment_refined              <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ judgment                      <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ disposition_refined           <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ disposition                   <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ data_origin                   <chr> "UNHOUSED_5", "UNHOUSED_5", "UNHOUSED_5"…
$ full_address                  <chr> "400 MCKINNEY, HOUSTON, TX", "400 MCKINN…
$ geocodio_latitude             <dbl> 29.76007, 29.76007, 29.73310, 29.73422, …
$ geocodio_longitude            <dbl> -95.37008, -95.37008, -95.38127, -95.302…
$ geocodio_accuracy_score       <dbl> 0.92, 0.92, 1.00, 1.00, 1.00, 1.00, 1.00…
$ geocodio_accuracy_type        <chr> "RANGE_INTERPOLATION", "RANGE_INTERPOLAT…
$ geocodio_address_line_1       <chr> "400 MCKINNEY ST", "400 MCKINNEY ST", "1…
$ geocodio_address_line_2       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_address_line_3       <chr> "HOUSTON, TX 77002", "HOUSTON, TX 77002"…
$ geocodio_house_number         <dbl> 400, 400, 1200, 7000, 1200, 2300, 2111, …
$ geocodio_street               <chr> "MCKINNEY ST", "MCKINNEY ST", "WHEELER S…
$ geocodio_unit_type            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_unit_number          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_city                 <chr> "HOUSTON", "HOUSTON", "HOUSTON", "HOUSTO…
$ geocodio_state                <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX"…
$ geocodio_county               <chr> "HARRIS COUNTY", "HARRIS COUNTY", "HARRI…
$ geocodio_postal_code          <dbl> 77002, 77002, 77004, 77011, 77004, 77020…
$ geocodio_country              <chr> "US", "US", "US", "US", "US", "US", "US"…
$ geocodio_source               <chr> "TIGER/LINE® FROM THE US CENSUS BUREAU",…

6.2 Grouping by date

unhoused_grouped_dates <- unhoused_data_types |> 
  mutate(off_yr = year(offense_date), # offense year
         off_mo = month(offense_date), # offense month in numbers
         off_mo_label = month(offense_date, label = TRUE), # offense month written
         off_yday = yday(offense_date), # offense day/365
         off_floor_mo = floor_date(offense_date, unit = "month"), # floor month
         off_floor_yr = floor_date(offense_date, "year"), # floor year
         jud_yr = year(judgment_date),
         jud_mo = month(judgment_date),
         jud_mo_label = month(judgment_date, label = TRUE),
         jud_yday = yday(judgment_date),
         jud_floor_mo = floor_date(judgment_date, unit = "month"),
         jud_floor_yr = floor_date(judgment_date, "year")
         )

unhoused_grouped_dates |> glimpse()
Rows: 31,391
Columns: 51
$ citation_number               <chr> "N40222481", "N40222497", "M81024487", "…
$ case_number                   <chr> "N40222481-01 NT 2025", "N40222497-01 NT…
$ offense_date                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ defendant_name                <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ refined_name                  <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ defendant_address             <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIG…
$ phone_number                  <dbl> NA, NA, 3468224952, NA, 3468224952, NA, …
$ offense_location              <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WH…
$ violation_code                <chr> "CC940", "CC940", "CC940", "CC940", "CC9…
$ violation_description         <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS…
$ violation_description_refined <chr> "SIT/LIE DOWN ON (BLANKET/STOOL) PLACED …
$ total_fine_amount_due         <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0,…
$ total_fine_amount_paid        <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0…
$ total_fine_amount_dismissed   <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, …
$ judgment_date                 <date> 2025-12-08, 2025-12-08, 2026-01-08, 202…
$ time_served                   <chr> "NO", "NO", "YES", "NO", "YES", "NO", "N…
$ judgment_refined              <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ judgment                      <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ disposition_refined           <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ disposition                   <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ data_origin                   <chr> "UNHOUSED_5", "UNHOUSED_5", "UNHOUSED_5"…
$ full_address                  <chr> "400 MCKINNEY, HOUSTON, TX", "400 MCKINN…
$ geocodio_latitude             <dbl> 29.76007, 29.76007, 29.73310, 29.73422, …
$ geocodio_longitude            <dbl> -95.37008, -95.37008, -95.38127, -95.302…
$ geocodio_accuracy_score       <dbl> 0.92, 0.92, 1.00, 1.00, 1.00, 1.00, 1.00…
$ geocodio_accuracy_type        <chr> "RANGE_INTERPOLATION", "RANGE_INTERPOLAT…
$ geocodio_address_line_1       <chr> "400 MCKINNEY ST", "400 MCKINNEY ST", "1…
$ geocodio_address_line_2       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_address_line_3       <chr> "HOUSTON, TX 77002", "HOUSTON, TX 77002"…
$ geocodio_house_number         <dbl> 400, 400, 1200, 7000, 1200, 2300, 2111, …
$ geocodio_street               <chr> "MCKINNEY ST", "MCKINNEY ST", "WHEELER S…
$ geocodio_unit_type            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_unit_number          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_city                 <chr> "HOUSTON", "HOUSTON", "HOUSTON", "HOUSTO…
$ geocodio_state                <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX"…
$ geocodio_county               <chr> "HARRIS COUNTY", "HARRIS COUNTY", "HARRI…
$ geocodio_postal_code          <dbl> 77002, 77002, 77004, 77011, 77004, 77020…
$ geocodio_country              <chr> "US", "US", "US", "US", "US", "US", "US"…
$ geocodio_source               <chr> "TIGER/LINE® FROM THE US CENSUS BUREAU",…
$ off_yr                        <dbl> 2025, 2025, 2025, 2025, 2025, 2025, 2025…
$ off_mo                        <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ off_mo_label                  <ord> Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, …
$ off_yday                      <dbl> 274, 274, 274, 274, 274, 274, 275, 275, …
$ off_floor_mo                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ off_floor_yr                  <date> 2025-01-01, 2025-01-01, 2025-01-01, 202…
$ jud_yr                        <dbl> 2025, 2025, 2026, 2025, 2026, 2025, 2025…
$ jud_mo                        <dbl> 12, 12, 1, 11, 1, 10, 12, 11, 11, 11, 11…
$ jud_mo_label                  <ord> Dec, Dec, Jan, Nov, Jan, Oct, Dec, Nov, …
$ jud_yday                      <dbl> 342, 342, 8, 329, 8, 275, 339, 317, 317,…
$ jud_floor_mo                  <date> 2025-12-01, 2025-12-01, 2026-01-01, 202…
$ jud_floor_yr                  <date> 2025-01-01, 2025-01-01, 2026-01-01, 202…

7. Creating and renaming final columns

Creating columns to indicate whether a case is form before/after Whitmire’s term began, what type of violation ordinance a violation code is, and a column detailing the amount fined. The final step also renames and selects final columns.

7.1 Whitmire flag

Whitmire’s term began in January 2024. These columns are separated into offense and judgment because there may be cases where a citation was given before Whitmire’s term began, but didn’t receive a judgment until after.

7.1.2 Offense date

Creating a column indicating whether an offense occurred before or after Whitmire’s term began.

unhoused_whitmire_offense <- unhoused_grouped_dates |>
  mutate(
    whitmire_offense = if_else( #if_else function creates a new column based on its argument. it instructs that if the argument is true, use "before," if it's false, use "after." this makes sure we include Jan. 1 2024 in "after"
      offense_date < as.Date("2024-01-01"),
      "BEFORE WHITMIRE",
      "AFTER WHITMIRE"
    )
  )

unhoused_whitmire_offense |> glimpse()
Rows: 31,391
Columns: 52
$ citation_number               <chr> "N40222481", "N40222497", "M81024487", "…
$ case_number                   <chr> "N40222481-01 NT 2025", "N40222497-01 NT…
$ offense_date                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ defendant_name                <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ refined_name                  <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ defendant_address             <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIG…
$ phone_number                  <dbl> NA, NA, 3468224952, NA, 3468224952, NA, …
$ offense_location              <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WH…
$ violation_code                <chr> "CC940", "CC940", "CC940", "CC940", "CC9…
$ violation_description         <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS…
$ violation_description_refined <chr> "SIT/LIE DOWN ON (BLANKET/STOOL) PLACED …
$ total_fine_amount_due         <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0,…
$ total_fine_amount_paid        <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0…
$ total_fine_amount_dismissed   <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, …
$ judgment_date                 <date> 2025-12-08, 2025-12-08, 2026-01-08, 202…
$ time_served                   <chr> "NO", "NO", "YES", "NO", "YES", "NO", "N…
$ judgment_refined              <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ judgment                      <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ disposition_refined           <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ disposition                   <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ data_origin                   <chr> "UNHOUSED_5", "UNHOUSED_5", "UNHOUSED_5"…
$ full_address                  <chr> "400 MCKINNEY, HOUSTON, TX", "400 MCKINN…
$ geocodio_latitude             <dbl> 29.76007, 29.76007, 29.73310, 29.73422, …
$ geocodio_longitude            <dbl> -95.37008, -95.37008, -95.38127, -95.302…
$ geocodio_accuracy_score       <dbl> 0.92, 0.92, 1.00, 1.00, 1.00, 1.00, 1.00…
$ geocodio_accuracy_type        <chr> "RANGE_INTERPOLATION", "RANGE_INTERPOLAT…
$ geocodio_address_line_1       <chr> "400 MCKINNEY ST", "400 MCKINNEY ST", "1…
$ geocodio_address_line_2       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_address_line_3       <chr> "HOUSTON, TX 77002", "HOUSTON, TX 77002"…
$ geocodio_house_number         <dbl> 400, 400, 1200, 7000, 1200, 2300, 2111, …
$ geocodio_street               <chr> "MCKINNEY ST", "MCKINNEY ST", "WHEELER S…
$ geocodio_unit_type            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_unit_number          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_city                 <chr> "HOUSTON", "HOUSTON", "HOUSTON", "HOUSTO…
$ geocodio_state                <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX"…
$ geocodio_county               <chr> "HARRIS COUNTY", "HARRIS COUNTY", "HARRI…
$ geocodio_postal_code          <dbl> 77002, 77002, 77004, 77011, 77004, 77020…
$ geocodio_country              <chr> "US", "US", "US", "US", "US", "US", "US"…
$ geocodio_source               <chr> "TIGER/LINE® FROM THE US CENSUS BUREAU",…
$ off_yr                        <dbl> 2025, 2025, 2025, 2025, 2025, 2025, 2025…
$ off_mo                        <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ off_mo_label                  <ord> Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, …
$ off_yday                      <dbl> 274, 274, 274, 274, 274, 274, 275, 275, …
$ off_floor_mo                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ off_floor_yr                  <date> 2025-01-01, 2025-01-01, 2025-01-01, 202…
$ jud_yr                        <dbl> 2025, 2025, 2026, 2025, 2026, 2025, 2025…
$ jud_mo                        <dbl> 12, 12, 1, 11, 1, 10, 12, 11, 11, 11, 11…
$ jud_mo_label                  <ord> Dec, Dec, Jan, Nov, Jan, Oct, Dec, Nov, …
$ jud_yday                      <dbl> 342, 342, 8, 329, 8, 275, 339, 317, 317,…
$ jud_floor_mo                  <date> 2025-12-01, 2025-12-01, 2026-01-01, 202…
$ jud_floor_yr                  <date> 2025-01-01, 2025-01-01, 2026-01-01, 202…
$ whitmire_offense              <chr> "AFTER WHITMIRE", "AFTER WHITMIRE", "AFT…

7.1.2 Judgment date

Creating a column indicating whether a judgment occurred before or after Whitmire’s term began.

unhoused_whitmire_judgment <- unhoused_whitmire_offense |>
  mutate(
    whitmire_judgment = if_else(
      judgment_date < as.Date("2024-01-01"),
      "BEFORE WHITMIRE",
      "AFTER WHITMIRE"
    )
  )

unhoused_whitmire_judgment |> glimpse()
Rows: 31,391
Columns: 53
$ citation_number               <chr> "N40222481", "N40222497", "M81024487", "…
$ case_number                   <chr> "N40222481-01 NT 2025", "N40222497-01 NT…
$ offense_date                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ defendant_name                <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ refined_name                  <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ defendant_address             <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIG…
$ phone_number                  <dbl> NA, NA, 3468224952, NA, 3468224952, NA, …
$ offense_location              <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WH…
$ violation_code                <chr> "CC940", "CC940", "CC940", "CC940", "CC9…
$ violation_description         <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS…
$ violation_description_refined <chr> "SIT/LIE DOWN ON (BLANKET/STOOL) PLACED …
$ total_fine_amount_due         <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0,…
$ total_fine_amount_paid        <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0…
$ total_fine_amount_dismissed   <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, …
$ judgment_date                 <date> 2025-12-08, 2025-12-08, 2026-01-08, 202…
$ time_served                   <chr> "NO", "NO", "YES", "NO", "YES", "NO", "N…
$ judgment_refined              <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ judgment                      <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ disposition_refined           <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ disposition                   <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ data_origin                   <chr> "UNHOUSED_5", "UNHOUSED_5", "UNHOUSED_5"…
$ full_address                  <chr> "400 MCKINNEY, HOUSTON, TX", "400 MCKINN…
$ geocodio_latitude             <dbl> 29.76007, 29.76007, 29.73310, 29.73422, …
$ geocodio_longitude            <dbl> -95.37008, -95.37008, -95.38127, -95.302…
$ geocodio_accuracy_score       <dbl> 0.92, 0.92, 1.00, 1.00, 1.00, 1.00, 1.00…
$ geocodio_accuracy_type        <chr> "RANGE_INTERPOLATION", "RANGE_INTERPOLAT…
$ geocodio_address_line_1       <chr> "400 MCKINNEY ST", "400 MCKINNEY ST", "1…
$ geocodio_address_line_2       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_address_line_3       <chr> "HOUSTON, TX 77002", "HOUSTON, TX 77002"…
$ geocodio_house_number         <dbl> 400, 400, 1200, 7000, 1200, 2300, 2111, …
$ geocodio_street               <chr> "MCKINNEY ST", "MCKINNEY ST", "WHEELER S…
$ geocodio_unit_type            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_unit_number          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_city                 <chr> "HOUSTON", "HOUSTON", "HOUSTON", "HOUSTO…
$ geocodio_state                <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX"…
$ geocodio_county               <chr> "HARRIS COUNTY", "HARRIS COUNTY", "HARRI…
$ geocodio_postal_code          <dbl> 77002, 77002, 77004, 77011, 77004, 77020…
$ geocodio_country              <chr> "US", "US", "US", "US", "US", "US", "US"…
$ geocodio_source               <chr> "TIGER/LINE® FROM THE US CENSUS BUREAU",…
$ off_yr                        <dbl> 2025, 2025, 2025, 2025, 2025, 2025, 2025…
$ off_mo                        <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ off_mo_label                  <ord> Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, …
$ off_yday                      <dbl> 274, 274, 274, 274, 274, 274, 275, 275, …
$ off_floor_mo                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ off_floor_yr                  <date> 2025-01-01, 2025-01-01, 2025-01-01, 202…
$ jud_yr                        <dbl> 2025, 2025, 2026, 2025, 2026, 2025, 2025…
$ jud_mo                        <dbl> 12, 12, 1, 11, 1, 10, 12, 11, 11, 11, 11…
$ jud_mo_label                  <ord> Dec, Dec, Jan, Nov, Jan, Oct, Dec, Nov, …
$ jud_yday                      <dbl> 342, 342, 8, 329, 8, 275, 339, 317, 317,…
$ jud_floor_mo                  <date> 2025-12-01, 2025-12-01, 2026-01-01, 202…
$ jud_floor_yr                  <date> 2025-01-01, 2025-01-01, 2026-01-01, 202…
$ whitmire_offense              <chr> "AFTER WHITMIRE", "AFTER WHITMIRE", "AFT…
$ whitmire_judgment             <chr> "AFTER WHITMIRE", "AFTER WHITMIRE", "AFT…

7.2 Violation flag

Making a column to tag whether a citation falls under a civility ordinance vs sidewalk obstruction ordinance.

unhoused_citation_tag <- unhoused_whitmire_judgment |> 
  mutate(
    ordinance_type = ifelse(
      violation_code == "CC940" | violation_code == "CC941", "CIVILITY",
      ifelse(
        violation_code == "CC921" | violation_code == "CC942" | violation_code == "CC943" | violation_code == "CC948", "SIDEWALK",
        ifelse(
          violation_code == "CC753" | violation_code == "CC956", "OTHER",
          "OTHER"
  ))))

unhoused_citation_tag |> glimpse()
Rows: 31,391
Columns: 54
$ citation_number               <chr> "N40222481", "N40222497", "M81024487", "…
$ case_number                   <chr> "N40222481-01 NT 2025", "N40222497-01 NT…
$ offense_date                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ defendant_name                <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ refined_name                  <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ defendant_address             <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIG…
$ phone_number                  <dbl> NA, NA, 3468224952, NA, 3468224952, NA, …
$ offense_location              <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WH…
$ violation_code                <chr> "CC940", "CC940", "CC940", "CC940", "CC9…
$ violation_description         <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS…
$ violation_description_refined <chr> "SIT/LIE DOWN ON (BLANKET/STOOL) PLACED …
$ total_fine_amount_due         <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0,…
$ total_fine_amount_paid        <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0…
$ total_fine_amount_dismissed   <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, …
$ judgment_date                 <date> 2025-12-08, 2025-12-08, 2026-01-08, 202…
$ time_served                   <chr> "NO", "NO", "YES", "NO", "YES", "NO", "N…
$ judgment_refined              <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ judgment                      <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ disposition_refined           <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ disposition                   <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ data_origin                   <chr> "UNHOUSED_5", "UNHOUSED_5", "UNHOUSED_5"…
$ full_address                  <chr> "400 MCKINNEY, HOUSTON, TX", "400 MCKINN…
$ geocodio_latitude             <dbl> 29.76007, 29.76007, 29.73310, 29.73422, …
$ geocodio_longitude            <dbl> -95.37008, -95.37008, -95.38127, -95.302…
$ geocodio_accuracy_score       <dbl> 0.92, 0.92, 1.00, 1.00, 1.00, 1.00, 1.00…
$ geocodio_accuracy_type        <chr> "RANGE_INTERPOLATION", "RANGE_INTERPOLAT…
$ geocodio_address_line_1       <chr> "400 MCKINNEY ST", "400 MCKINNEY ST", "1…
$ geocodio_address_line_2       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_address_line_3       <chr> "HOUSTON, TX 77002", "HOUSTON, TX 77002"…
$ geocodio_house_number         <dbl> 400, 400, 1200, 7000, 1200, 2300, 2111, …
$ geocodio_street               <chr> "MCKINNEY ST", "MCKINNEY ST", "WHEELER S…
$ geocodio_unit_type            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_unit_number          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_city                 <chr> "HOUSTON", "HOUSTON", "HOUSTON", "HOUSTO…
$ geocodio_state                <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX"…
$ geocodio_county               <chr> "HARRIS COUNTY", "HARRIS COUNTY", "HARRI…
$ geocodio_postal_code          <dbl> 77002, 77002, 77004, 77011, 77004, 77020…
$ geocodio_country              <chr> "US", "US", "US", "US", "US", "US", "US"…
$ geocodio_source               <chr> "TIGER/LINE® FROM THE US CENSUS BUREAU",…
$ off_yr                        <dbl> 2025, 2025, 2025, 2025, 2025, 2025, 2025…
$ off_mo                        <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ off_mo_label                  <ord> Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, …
$ off_yday                      <dbl> 274, 274, 274, 274, 274, 274, 275, 275, …
$ off_floor_mo                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ off_floor_yr                  <date> 2025-01-01, 2025-01-01, 2025-01-01, 202…
$ jud_yr                        <dbl> 2025, 2025, 2026, 2025, 2026, 2025, 2025…
$ jud_mo                        <dbl> 12, 12, 1, 11, 1, 10, 12, 11, 11, 11, 11…
$ jud_mo_label                  <ord> Dec, Dec, Jan, Nov, Jan, Oct, Dec, Nov, …
$ jud_yday                      <dbl> 342, 342, 8, 329, 8, 275, 339, 317, 317,…
$ jud_floor_mo                  <date> 2025-12-01, 2025-12-01, 2026-01-01, 202…
$ jud_floor_yr                  <date> 2025-01-01, 2025-01-01, 2026-01-01, 202…
$ whitmire_offense              <chr> "AFTER WHITMIRE", "AFTER WHITMIRE", "AFT…
$ whitmire_judgment             <chr> "AFTER WHITMIRE", "AFTER WHITMIRE", "AFT…
$ ordinance_type                <chr> "CIVILITY", "CIVILITY", "CIVILITY", "CIV…

7.3 Amount fined

unhoused_fined <- unhoused_citation_tag |>
  mutate(amount_fined = total_fine_amount_due +
                        total_fine_amount_paid +
                        total_fine_amount_dismissed)

unhoused_fined |> glimpse()
Rows: 31,391
Columns: 55
$ citation_number               <chr> "N40222481", "N40222497", "M81024487", "…
$ case_number                   <chr> "N40222481-01 NT 2025", "N40222497-01 NT…
$ offense_date                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ defendant_name                <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ refined_name                  <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ defendant_address             <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIG…
$ phone_number                  <dbl> NA, NA, 3468224952, NA, 3468224952, NA, …
$ offense_location              <chr> "400 MCKINNEY", "400 MCKINNEY", "1200 WH…
$ violation_code                <chr> "CC940", "CC940", "CC940", "CC940", "CC9…
$ violation_description         <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS…
$ violation_description_refined <chr> "SIT/LIE DOWN ON (BLANKET/STOOL) PLACED …
$ total_fine_amount_due         <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0,…
$ total_fine_amount_paid        <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0…
$ total_fine_amount_dismissed   <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, …
$ judgment_date                 <date> 2025-12-08, 2025-12-08, 2026-01-08, 202…
$ time_served                   <chr> "NO", "NO", "YES", "NO", "YES", "NO", "N…
$ judgment_refined              <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ judgment                      <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ disposition_refined           <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ disposition                   <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ data_origin                   <chr> "UNHOUSED_5", "UNHOUSED_5", "UNHOUSED_5"…
$ full_address                  <chr> "400 MCKINNEY, HOUSTON, TX", "400 MCKINN…
$ geocodio_latitude             <dbl> 29.76007, 29.76007, 29.73310, 29.73422, …
$ geocodio_longitude            <dbl> -95.37008, -95.37008, -95.38127, -95.302…
$ geocodio_accuracy_score       <dbl> 0.92, 0.92, 1.00, 1.00, 1.00, 1.00, 1.00…
$ geocodio_accuracy_type        <chr> "RANGE_INTERPOLATION", "RANGE_INTERPOLAT…
$ geocodio_address_line_1       <chr> "400 MCKINNEY ST", "400 MCKINNEY ST", "1…
$ geocodio_address_line_2       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_address_line_3       <chr> "HOUSTON, TX 77002", "HOUSTON, TX 77002"…
$ geocodio_house_number         <dbl> 400, 400, 1200, 7000, 1200, 2300, 2111, …
$ geocodio_street               <chr> "MCKINNEY ST", "MCKINNEY ST", "WHEELER S…
$ geocodio_unit_type            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_unit_number          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_city                 <chr> "HOUSTON", "HOUSTON", "HOUSTON", "HOUSTO…
$ geocodio_state                <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX"…
$ geocodio_county               <chr> "HARRIS COUNTY", "HARRIS COUNTY", "HARRI…
$ geocodio_postal_code          <dbl> 77002, 77002, 77004, 77011, 77004, 77020…
$ geocodio_country              <chr> "US", "US", "US", "US", "US", "US", "US"…
$ geocodio_source               <chr> "TIGER/LINE® FROM THE US CENSUS BUREAU",…
$ off_yr                        <dbl> 2025, 2025, 2025, 2025, 2025, 2025, 2025…
$ off_mo                        <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ off_mo_label                  <ord> Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, …
$ off_yday                      <dbl> 274, 274, 274, 274, 274, 274, 275, 275, …
$ off_floor_mo                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ off_floor_yr                  <date> 2025-01-01, 2025-01-01, 2025-01-01, 202…
$ jud_yr                        <dbl> 2025, 2025, 2026, 2025, 2026, 2025, 2025…
$ jud_mo                        <dbl> 12, 12, 1, 11, 1, 10, 12, 11, 11, 11, 11…
$ jud_mo_label                  <ord> Dec, Dec, Jan, Nov, Jan, Oct, Dec, Nov, …
$ jud_yday                      <dbl> 342, 342, 8, 329, 8, 275, 339, 317, 317,…
$ jud_floor_mo                  <date> 2025-12-01, 2025-12-01, 2026-01-01, 202…
$ jud_floor_yr                  <date> 2025-01-01, 2025-01-01, 2026-01-01, 202…
$ whitmire_offense              <chr> "AFTER WHITMIRE", "AFTER WHITMIRE", "AFT…
$ whitmire_judgment             <chr> "AFTER WHITMIRE", "AFTER WHITMIRE", "AFT…
$ ordinance_type                <chr> "CIVILITY", "CIVILITY", "CIVILITY", "CIV…
$ amount_fined                  <dbl> 252.2, 265.2, 210.6, 194.0, 210.6, 199.0…

7.4 Renaming and selecting columns

Making the final clean object.

unhoused_renamed <- unhoused_fined |> 
  select(!c(offense_location)) |> # this column was reworked into full_address, which was used to geocode. offense_location is no longer needed because full_offense_location replaces it
  rename(fines_due = total_fine_amount_due  ,
         fines_paid = total_fine_amount_paid,
         fines_dismissed = total_fine_amount_dismissed,
         full_offense_location = full_address)

unhoused_renamed |> glimpse()
Rows: 31,391
Columns: 54
$ citation_number               <chr> "N40222481", "N40222497", "M81024487", "…
$ case_number                   <chr> "N40222481-01 NT 2025", "N40222497-01 NT…
$ offense_date                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ defendant_name                <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ refined_name                  <chr> "EUCEDA, BAYFON", "VELEZ RIASCOS, HECTOR…
$ defendant_address             <chr> "HOMELESS  ,", "HOMELESS  ,", "6919 TRIG…
$ phone_number                  <dbl> NA, NA, 3468224952, NA, 3468224952, NA, …
$ violation_code                <chr> "CC940", "CC940", "CC940", "CC940", "CC9…
$ violation_description         <chr> "(CIVILITY ORDINANCE) 0700 TO 2300 HOURS…
$ violation_description_refined <chr> "SIT/LIE DOWN ON (BLANKET/STOOL) PLACED …
$ fines_due                     <dbl> 252.2, 265.2, 0.0, 0.0, 0.0, 0.0, 194.0,…
$ fines_paid                    <dbl> 0.0, 0.0, 210.6, 0.0, 210.6, 0.0, 0.0, 0…
$ fines_dismissed               <dbl> 0, 0, 0, 194, 0, 199, 0, 0, 0, 0, 0, 0, …
$ judgment_date                 <date> 2025-12-08, 2025-12-08, 2026-01-08, 202…
$ time_served                   <chr> "NO", "NO", "YES", "NO", "YES", "NO", "N…
$ judgment_refined              <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ judgment                      <chr> "FAILURE TO APPEAR WARRANT (FTA)", "FAIL…
$ disposition_refined           <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ disposition                   <chr> "UNDISPOSED", "UNDISPOSED", "GUILTY TRIA…
$ data_origin                   <chr> "UNHOUSED_5", "UNHOUSED_5", "UNHOUSED_5"…
$ full_offense_location         <chr> "400 MCKINNEY, HOUSTON, TX", "400 MCKINN…
$ geocodio_latitude             <dbl> 29.76007, 29.76007, 29.73310, 29.73422, …
$ geocodio_longitude            <dbl> -95.37008, -95.37008, -95.38127, -95.302…
$ geocodio_accuracy_score       <dbl> 0.92, 0.92, 1.00, 1.00, 1.00, 1.00, 1.00…
$ geocodio_accuracy_type        <chr> "RANGE_INTERPOLATION", "RANGE_INTERPOLAT…
$ geocodio_address_line_1       <chr> "400 MCKINNEY ST", "400 MCKINNEY ST", "1…
$ geocodio_address_line_2       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_address_line_3       <chr> "HOUSTON, TX 77002", "HOUSTON, TX 77002"…
$ geocodio_house_number         <dbl> 400, 400, 1200, 7000, 1200, 2300, 2111, …
$ geocodio_street               <chr> "MCKINNEY ST", "MCKINNEY ST", "WHEELER S…
$ geocodio_unit_type            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_unit_number          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ geocodio_city                 <chr> "HOUSTON", "HOUSTON", "HOUSTON", "HOUSTO…
$ geocodio_state                <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX"…
$ geocodio_county               <chr> "HARRIS COUNTY", "HARRIS COUNTY", "HARRI…
$ geocodio_postal_code          <dbl> 77002, 77002, 77004, 77011, 77004, 77020…
$ geocodio_country              <chr> "US", "US", "US", "US", "US", "US", "US"…
$ geocodio_source               <chr> "TIGER/LINE® FROM THE US CENSUS BUREAU",…
$ off_yr                        <dbl> 2025, 2025, 2025, 2025, 2025, 2025, 2025…
$ off_mo                        <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, …
$ off_mo_label                  <ord> Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, …
$ off_yday                      <dbl> 274, 274, 274, 274, 274, 274, 275, 275, …
$ off_floor_mo                  <date> 2025-10-01, 2025-10-01, 2025-10-01, 202…
$ off_floor_yr                  <date> 2025-01-01, 2025-01-01, 2025-01-01, 202…
$ jud_yr                        <dbl> 2025, 2025, 2026, 2025, 2026, 2025, 2025…
$ jud_mo                        <dbl> 12, 12, 1, 11, 1, 10, 12, 11, 11, 11, 11…
$ jud_mo_label                  <ord> Dec, Dec, Jan, Nov, Jan, Oct, Dec, Nov, …
$ jud_yday                      <dbl> 342, 342, 8, 329, 8, 275, 339, 317, 317,…
$ jud_floor_mo                  <date> 2025-12-01, 2025-12-01, 2026-01-01, 202…
$ jud_floor_yr                  <date> 2025-01-01, 2025-01-01, 2026-01-01, 202…
$ whitmire_offense              <chr> "AFTER WHITMIRE", "AFTER WHITMIRE", "AFT…
$ whitmire_judgment             <chr> "AFTER WHITMIRE", "AFTER WHITMIRE", "AFT…
$ ordinance_type                <chr> "CIVILITY", "CIVILITY", "CIVILITY", "CIV…
$ amount_fined                  <dbl> 252.2, 265.2, 210.6, 194.0, 210.6, 199.0…

8. Exporting

Exporting entirely cleaned data for analysis.

unhoused_clean <- unhoused_renamed |> 
  write_csv("data-processed/unhoused-clean.csv")