library(tidyverse)
library(janitor)
library(rvest)
Longhorn Soccer Stats
The goal here is to scrape individual and goalkeeper stats from the University of Texas women’s soccer team web pages. The data is stored in tables on the “Box Score” page for each game. The data is then cleaned and exported as RDS files.
Those files will have to be imported and combined, and then cleaned up before analysis.
Of note
This creates two files for each game (player stats and goalkeeper stats) and exports them as rds files. The idea is those files could be combined in a later script and further cleaning done.
This could be refactored to pull all the URLs from a schedule page and then run through a GitHub Action to collect the data each week.
What we want
A list of the information we are trying to gather from each page.
- The date of the game
- The location of the game
- The visiting team
- The home team
- Individual stats for visitors
- Goalkeeping stats for visitors
- Individual stats for home team
- Goalkeeping stats for home team
Some pages to test
- Houston vs Texas is a 2024 home game.
- Texas vs SMU is a 2024 road game.
- Texas vs Florida St. is a 2023 post-season match.
- Incarnate Word vs Texas is a game from a previous year. It also doesn’t have “Cautions and Ejections” which revealed I needed a better way to pick out the correct tables.
- Iowa vs Texas also revealed I need to be more specific about where to find visiting/home team names.
Setup
Scrape stats function
Once I figured out how to do all this, I turned the process into a function that can be fed a list of URLs to make files for each match.
This combines all the processing worked out below and turns it into a function so we can feed a list of URLs to make all the files.
# Scrapes one box score page and writes two RDS files for that match:
# player stats (pl_*.rds) and goalkeeper stats (gk_*.rds), both under
# data-raw/soccer/.
#
# file_url: URL of a single "Box Score" page.
# Returns: the combined goalkeeper tibble, invisibly (the value of the final
#   write_rds() call).
# Errors: stops with a descriptive message when the page does not contain the
#   expected game details, team sub-headings or the four stats tables.
scrape_stats <- function(file_url) {
  # A pause to avoid hammering servers
  Sys.sleep(2)

  # reads in URL
  game_stats_raw <- read_html(file_url)

  # pulls game details from description list
  m_details <- game_stats_raw |>
    html_elements("dd") |>
    html_text()
  if (length(m_details) < 3) {
    stop("Expected at least 3 game details (<dd>) at ", file_url, call. = FALSE)
  }

  # creates variables for game details: 1st <dd> is the date, 3rd is the site
  m_date <- m_details[[1]] |> mdy()
  m_site <- m_details[[3]]

  # pulls who is visitor vs home; targeted by id and class so only the
  # sub-headings inside the individual stats section are picked up
  subheads <- game_stats_raw |>
    html_elements("#individual-stats") |>
    html_elements(".sub-heading") |>
    html_text()
  if (length(subheads) < 2) {
    stop("Expected 2 team sub-headings at ", file_url, call. = FALSE)
  }

  # Removes the trailing score and any leading "#<rank> " to get the team name
  strip_rank_and_score <- function(team_score) {
    team_score |>
      str_remove(" \\d+$") |>
      str_remove("^#\\d+ ")
  }
  v_team <- strip_rank_and_score(subheads[[1]])
  h_team <- strip_rank_and_score(subheads[[2]])

  # A function to handle some player stats cleaning. The raw table carries
  # "Starters" / "Substitutes" / "Totals" marker rows; this turns them into a
  # `start` column, fills it down, drops the marker rows and adds the game
  # date and site (captured from the enclosing function).
  clean_indi_stats <- function(df) {
    df |>
      mutate(
        start = case_when(
          player == "Starters" ~ "Starter",
          player == "Substitutes" ~ "Substitute",
          player == "Totals" ~ "Total",
          .default = NA
        ),
        .after = pos
      ) |>
      fill(start) |>
      filter(!player %in% c("Starters", "Substitutes", "Totals")) |>
      mutate(
        date = m_date,
        site = m_site
      )
  }

  # creates list of individual stats tables
  game_stats_tables <- game_stats_raw |>
    html_elements("section#individual-stats") |>
    minimal_html() |>
    html_table()
  if (length(game_stats_tables) < 4) {
    stop("Expected 4 individual stats tables at ", file_url, call. = FALSE)
  }

  # gets each table type depending on visitor/home
  v_pl_raw <- game_stats_tables[[1]] |> clean_names()
  v_gk_raw <- game_stats_tables[[2]] |> clean_names()
  h_pl_raw <- game_stats_tables[[3]] |> clean_names()
  h_gk_raw <- game_stats_tables[[4]] |> clean_names()

  # adds team names to individual stats
  v_pl <- v_pl_raw |>
    clean_indi_stats() |>
    mutate(team = v_team, vs = h_team)

  h_pl <- h_pl_raw |>
    clean_indi_stats() |>
    mutate(team = h_team, vs = v_team)

  # combines visitor/home data into one table
  pl <- bind_rows(v_pl, h_pl)

  # adds game details and team names to goalkeeping stats: visitors
  v_gk <- v_gk_raw |>
    filter(!goalie %in% c("Goalkeeping", "Totals")) |>
    mutate(team = v_team, vs = h_team, date = m_date, site = m_site)

  # adds game details and team names to goalkeeping stats: home
  h_gk <- h_gk_raw |>
    filter(!goalie %in% c("Goalkeeping", "Totals")) |>
    mutate(team = h_team, vs = v_team, date = m_date, site = m_site)

  # combines goalkeeper stats
  gk <- bind_rows(v_gk, h_gk)

  # preps file names for export based on matchup
  file_name_prefix <- paste(m_date, v_team, h_team, sep = "_") |>
    str_replace_all(" ", "_") |>
    str_replace_all("/", "-")

  pl_export_path <- paste0("data-raw/soccer/", "pl_", file_name_prefix, ".rds")
  gk_export_path <- paste0("data-raw/soccer/", "gk_", file_name_prefix, ".rds")

  # makes sure the export folder exists so write_rds() does not fail
  dir.create("data-raw/soccer", recursive = TRUE, showWarnings = FALSE)

  # exports files; the last call's value (gk, invisibly) is the return value
  pl |> write_rds(pl_export_path)
  gk |> write_rds(gk_export_path)
}
Testing the scrape
This concept could be refactored to use a schedule page to get all the URLs to the box score pages, but I don’t have that in me right now.
I first build a list of URLs to test the function. I’m building a couple of versions for convenience and to test some specific things.
A single page
# A single box score URL for quick tests of scrape_stats
url_one <- "https://texaslonghorns.com/sports/womens-soccer/stats/2024/texas-am/boxscore/16315"
This is a small list
# A short list of box score URLs, including one from 2015
url_list_short <- c(
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/houston/boxscore/16304",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/iowa/boxscore/16307",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2015/north-carolina/boxscore/8819",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/texas-am/boxscore/16315"
)
This is a longer list of the entire 2024 season:
# Box score URLs for the 2024 season (ids 16304 through 16321)
url_list_2024 <- c(
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/houston/boxscore/16304",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/cal-state-bakersfield/boxscore/16305",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/smu/boxscore/16306",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/iowa/boxscore/16307",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/central-michigan/boxscore/16308",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/san-diego-state/boxscore/16309",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/long-beach-state/boxscore/16310",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/houston-christian/boxscore/16311",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/alabama/boxscore/16312",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/mississippi-state/boxscore/16313",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/ole-miss/boxscore/16314",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/texas-am/boxscore/16315",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/florida/boxscore/16316",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/oklahoma/boxscore/16317",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/lsu/boxscore/16318",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/arkansas/boxscore/16319",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/missouri/boxscore/16320",
  "https://texaslonghorns.com/sports/womens-soccer/stats/2024/georgia/boxscore/16321"
)
Process all the files
This uses map to process all the files.
# Runs the scraper on each URL; map() collects each call's return value
url_one |> map(scrape_stats)
[[1]]
# A tibble: 2 × 10
position number goalie minutes ga saves team vs date site
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <date> <chr>
1 GK 0 Fuller, Sydn… 90:00 2 4 Texa… Texas 2024-09-29 Aust…
2 GK 1 Justus, Mia 90:00 0 5 Texas Texa… 2024-09-29 Aust…
Working out details
This is the code I used to work out the process function above.
Get the raw HTML
# Reads the box score page used for the step-by-step walkthrough below
game_stats_raw <- read_html("https://texaslonghorns.com/sports/womens-soccer/stats/2015/north-carolina/boxscore/8819")
Match details
We pull these from a description list on the main “Box Score” page.
# Text of every <dd> element on the page (the game details list)
m_details <- game_stats_raw |>
  html_elements("dd") |>
  html_text()

# Picks the details we need by position: 1st = date, 3rd = site, 4th = stadium
m_date <- m_details[[1]] |> mdy()
m_site <- m_details[[3]]
m_stadium <- m_details[[4]]
Teams and scores
We get these from the subheads on the “Individual” page. I had to target these pretty specifically by id and class to get the correct tables.
# Sub-headings inside the individual stats section: visitor first, then home
subheads <- game_stats_raw |>
  html_elements("#individual-stats") |>
  html_elements(".sub-heading") |>
  html_text()

v_team_score <- subheads[[1]]
h_team_score <- subheads[[2]]

# Removes the trailing score and any leading "#<rank> " to get the team name
v_team <- v_team_score[[1]] |> str_remove(" \\d+$") |> str_remove("^#\\d+ ")
h_team <- h_team_score[[1]] |> str_remove(" \\d+$") |> str_remove("^#\\d+ ")

# Extracts the trailing digits as the score
v_score <- v_team_score |> str_extract("\\d+$")
h_score <- h_team_score |> str_extract("\\d+$")
paste("Visiting team was", v_team, "and their score was", v_score)
[1] "Visiting team was North Carolina and their score was 2"
paste("Home team was", h_team, "and their score was", h_score)
[1] "Home team was Texas and their score was 0"
Individual stats
There are four separate tables we need that are displayed on the “Individual” tab.
Get the tables
# Parses every table inside the individual stats section into a list of tibbles
game_stats_tables <- game_stats_raw |>
  html_elements("section#individual-stats") |>
  minimal_html() |>
  html_table()

game_stats_tables
[[1]]
# A tibble: 23 × 8
Pos `#` Player SH SOG G A MIN
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Starters Starters "Starters" Star… Star… Star… Star… Star…
2 D 5 "5 \r\n … 0 0 0 0 90
3 F 6 "6 \r\n … 5 1 0 1 61
4 M 10 "10 \r\n … 4 1 1 0 61
5 M 11 "11 \r\n … 0 0 0 0 66
6 F 12 "12 \r\n … 4 2 0 0 62
7 M 15 "15 \r\n … 0 0 0 0 66
8 D 16 "16 \r\n … 1 0 0 0 90
9 M 21 "21 \r\n … 1 1 1 0 66
10 GK 23 "23 \r\n … 0 0 0 0 45
# ℹ 13 more rows
[[2]]
# A tibble: 4 × 6
Position `#` Goalie Minutes GA Saves
<chr> <chr> <chr> <chr> <chr> <chr>
1 "Goalkeeping" "Goalkeeping" Goalkeeping "Goalkeeping" Goalkeeping Goalke…
2 "GK" "1" Bryane Heaberlin "45:00" 0 2
3 "GK" "23" Lindsey Harris "45:00" 0 0
4 "" "" Totals "" 0 2
[[3]]
# A tibble: 23 × 8
Pos `#` Player SH SOG G A MIN
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Starters Starters "Starters" Star… Star… Star… Star… Star…
2 GK 1 "1 \r\n … 1 1 0 0 90
3 M 4 "4 \r\n … 1 1 0 0 90
4 M 5 "5 \r\n … 0 0 0 0 85
5 D 12 "12 \r\n … 0 0 0 0 90
6 D 13 "13 \r\n … 0 0 0 0 90
7 D 14 "14 \r\n … 0 0 0 0 90
8 F 17 "17 \r\n … 2 0 0 0 66
9 F 20 "20 \r\n … 1 0 0 0 89
10 F 23 "23 \r\n … 1 0 0 0 68
# ℹ 13 more rows
[[4]]
# A tibble: 3 × 6
Position `#` Goalie Minutes GA Saves
<chr> <chr> <chr> <chr> <chr> <chr>
1 "Goalkeeping" "Goalkeeping" Goalkeeping "Goalkeeping" Goalkeeping Goalkeeping
2 "GK" "1" Smith, Abby "90:00" 2 7
3 "" "" Totals "" 2 7
Saves specific tables we need
# Table order on the page: visitor players, visitor goalkeepers,
# home players, home goalkeepers
v_pl_raw <- game_stats_tables[[1]] |> clean_names()
v_gk_raw <- game_stats_tables[[2]] |> clean_names()
h_pl_raw <- game_stats_tables[[3]] |> clean_names()
h_gk_raw <- game_stats_tables[[4]] |> clean_names()
Clean individual stats
A function to clean stats …
- Adds a column to indicate starters vs subs
- Removes totals and other headers
# A function to handle some player stats cleaning
#
# df: a raw player stats table after clean_names(); uses its `player` and
#   `pos` columns.
# Returns: df with a `start` column placed after `pos`, the "Starters" /
#   "Substitutes" / "Totals" marker rows removed, and `date` and `site` added.
# Note: m_date and m_site are not arguments; they must already exist in the
#   enclosing (here: global) environment.
clean_indi_stats <- function(df) {
  df |>
    mutate(
      # marker rows label the section; every other row starts as NA
      start = case_when(
        player == "Starters" ~ "Starter",
        player == "Substitutes" ~ "Substitute",
        player == "Totals" ~ "Total",
        .default = NA
      ),
      .after = pos
    ) |>
    # carries each marker's label down to the rows beneath it
    fill(start) |>
    filter(!player %in% c("Starters", "Substitutes", "Totals")) |>
    mutate(
      date = m_date,
      site = m_site
    )
}
This takes the individual stats and runs them through the cleaning function above and then adds home and away team values.
# Visitor players: clean, then tag with team and opponent
v_pl <- v_pl_raw |>
  clean_indi_stats() |>
  mutate(team = v_team, vs = h_team)

v_pl

# Home players: same steps with team and opponent swapped
h_pl <- h_pl_raw |>
  clean_indi_stats() |>
  mutate(team = h_team, vs = v_team)

h_pl
Combine individual stats
Combines the visitor and home player stats into one table.
# Stacks visitor and home player rows into one table
pl <- bind_rows(v_pl, h_pl)

pl
Clean goalkeeper stats
This is the same process as above, but for goalkeepers. It’s a little less complicated.
# Visitor goalkeepers: drop the header and totals rows, add game details
v_gk <- v_gk_raw |>
  filter(!goalie %in% c("Goalkeeping", "Totals")) |>
  mutate(team = v_team, vs = h_team, date = m_date, site = m_site)

# Home goalkeepers: same steps with team and opponent swapped
h_gk <- h_gk_raw |>
  filter(!goalie %in% c("Goalkeeping", "Totals")) |>
  mutate(team = h_team, vs = v_team, date = m_date, site = m_site)

v_gk

h_gk
Combine goalkeepers
# Stacks visitor and home goalkeeper rows into one table
gk <- bind_rows(v_gk, h_gk)

gk
Writing the exports
The actual exports are commented out so we don’t overwrite what came from above.
# Builds "<date>_<visitor>_<home>", with spaces and slashes made file-name safe
file_name_prefix <- paste(m_date, v_team, h_team, sep = "_") |>
  str_replace_all(" ", "_") |>
  str_replace_all("/", "-")

pl_export_path <- paste0("data-raw/soccer/", "pl_", file_name_prefix, ".rds")
gk_export_path <- paste0("data-raw/soccer/", "gk_", file_name_prefix, ".rds")

pl_export_path
[1] "data-raw/soccer/pl_2015-08-28_North_Carolina_Texas.rds"
gk_export_path
[1] "data-raw/soccer/gk_2015-08-28_North_Carolina_Texas.rds"
## Commented so it doesn't write anything out
# pl |> write_rds(pl_export_path)
# gk |> write_rds(gk_export_path)