Web scraping to collect the top 100 road cyclists in the world, for the following purposes:

Build the Cycling Dream Team
Betting
Sentimental Purpose (on specific cyclist or team in general)

Check the site's /robots.txt first to make sure that web scraping is allowed.

Library load

# package for data wrangling
library(dplyr)
library(tidyr)
library(readxl)
library(purrr)

# package for string manipulation
library(stringr)
library(stringi)
library(qdapRegex)

# package for web scraping
library(rvest)
library(RSelenium)
library(httr)

# package for data exploration
library(ggplot2)

Scrap list of links

There are two kinds of data that need to be scraped.

Cyclist Personal Data
Cyclist Points

The two come from different links, so I will scrape two lists of links.

# Download and parse the ranking page that lists the riders
link_stats <- "https://www.procyclingstats.com/rankings.php"
page_list <- read_html(link_stats)

# Cyclist stats: the href of every link matched by the selector
cyclist_link <- html_attr(html_nodes(page_list, css = ".fs10+ td a"), "href")

# Keep the scraped (relative) href and put the absolute URL next to it
cyclist_link_2 <- mutate(
  data.frame(cyclist_link),
  link = paste0("https://www.procyclingstats.com/", cyclist_link)
)

head(cyclist_link_2)

#>               cyclist_link
#> 1      rider/primoz-roglic
#> 2      rider/tadej-pogacar
#> 3      rider/wout-van-aert
#> 4 rider/julian-alaphilippe
#> 5 rider/alejandro-valverde
#> 6        rider/sam-bennett
#>                                                       link
#> 1      https://www.procyclingstats.com/rider/primoz-roglic
#> 2      https://www.procyclingstats.com/rider/tadej-pogacar
#> 3      https://www.procyclingstats.com/rider/wout-van-aert
#> 4 https://www.procyclingstats.com/rider/julian-alaphilippe
#> 5 https://www.procyclingstats.com/rider/alejandro-valverde
#> 6        https://www.procyclingstats.com/rider/sam-bennett

# Spot-check the fifth profile URL
cyclist_link_2[["link"]][5]

#> [1] "https://www.procyclingstats.com/rider/alejandro-valverde"

# Points stats: the href of every link matched by the selector
points_link <- html_attr(html_nodes(page_list, css = ".cu600+ td a"), "href")

# Keep the scraped (relative) href and put the absolute URL next to it
points_link_2 <- mutate(
  data.frame(points_link),
  link = paste0("https://www.procyclingstats.com/", points_link)
)

head(points_link_2)

#>                                                   points_link
#> 1 rider.php?date=2021-05-10&id=174582&p=results&s=pcs-ranking
#> 2 rider.php?date=2021-05-10&id=194619&p=results&s=pcs-ranking
#> 3 rider.php?date=2021-05-10&id=168961&p=results&s=pcs-ranking
#> 4 rider.php?date=2021-05-10&id=137427&p=results&s=pcs-ranking
#> 5 rider.php?date=2021-05-10&id=140924&p=results&s=pcs-ranking
#> 6 rider.php?date=2021-05-10&id=139405&p=results&s=pcs-ranking
#>                                                                                          link
#> 1 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=174582&p=results&s=pcs-ranking
#> 2 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=194619&p=results&s=pcs-ranking
#> 3 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=168961&p=results&s=pcs-ranking
#> 4 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=137427&p=results&s=pcs-ranking
#> 5 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=140924&p=results&s=pcs-ranking
#> 6 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=139405&p=results&s=pcs-ranking

# Spot-check the fifth points URL ...
points_link_2[["link"]][5]

#> [1] "https://www.procyclingstats.com/rider.php?date=2021-05-10&id=140924&p=results&s=pcs-ranking"

# ... and the second profile URL
cyclist_link_2[["link"]][2]

#> [1] "https://www.procyclingstats.com/rider/tadej-pogacar"

Gather information

We’ve finished gathering the links for all 100 top-rated cyclists from procyclingstats. Next, let’s gather information from every link. We will scrape Name, Team, DOB, Nationality, Weight and Height. Points will be collected later.

First, let’s save the main HTML in an object.

# Parse the first profile page and the first points page; the selectors in the
# following steps are tried out against these two documents
cyclist_1 <- cyclist_link_2$link[1] %>% read_html()
points_1 <- points_link_2$link[1] %>% read_html()

Name

# The rider's name is read from the first <h1> of the profile page
name <- html_text2(html_node(cyclist_1, css = "h1"))
name

#> [1] "Primož Roglic"

Team

# The team is read from the first node matching "span.red.showIfMobile"
team <- html_text2(html_node(cyclist_1, css = "span.red.showIfMobile"))
team

#> [1] "Team Jumbo-Visma"

Date of Birth (DOB)

# Date of birth: the text between the "Date of birth: " label and the next "("
# in the rider info block.
dob <- html_nodes(cyclist_1, css = ".rdr-info-cont") %>%
  html_text2() %>%
  as.data.frame() %>%
  setNames("DateofBirth") %>%
  mutate(DateofBirth = unlist(qdapRegex::ex_between(text.var = DateofBirth,
                                                    left = "Date of birth: ", right = "("))) %>%
  # Remove the ordinal suffix only where it directly follows the day number.
  # A bare "th|st|nd|rd" pattern would also delete the "st" in "August" and
  # leave a month name that can no longer be parsed as a date.
  mutate(DateofBirth = gsub("([0-9]+)(st|nd|rd|th)", "\\1", DateofBirth))
dob

#>       DateofBirth
#> 1 29 October 1989

Nationality

# Nationality: what remains of the rider info text after cutting everything
# from the "Weight" line onwards and everything up to the "Nationality:" label.
nationality <- html_nodes(cyclist_1, css = ".rdr-info-cont") %>%
  html_text2() %>%
  as.data.frame() %>%
  setNames("Nationality") %>%
  # plain gsub() is used for this field instead of qdapRegex
  mutate(Nationality = gsub("\nWeight.*", "", Nationality)) %>%
  mutate(Nationality = gsub(".*Nationality:","", Nationality)) %>%
  # gsub() keeps whatever whitespace surrounded the value (e.g. the space
  # after the colon); strip it so it does not end up in the factor levels
  mutate(Nationality = trimws(Nationality))
nationality

#>   Nationality
#> 1    Slovenia

Weight

# Weight: the text between the "Weight:" label and the "kg" unit
weight <- data.frame(
  Weight = html_text2(html_nodes(cyclist_1, css = ".rdr-info-cont"))
) %>%
  mutate(
    Weight = unlist(
      qdapRegex::ex_between(Weight, left = "Weight:", right = "kg")
    )
  )
weight

#>   Weight
#> 1     65

Height

# Height: the text between the "Height:" label and the "mPl" marker
# (NOTE(review): "mPl" looks like the unit "m" running into the next label
# that starts with "Pl" -- confirm against the page text)
height <- data.frame(
  Height = html_text2(html_nodes(cyclist_1, css = ".rdr-info-cont"))
) %>%
  mutate(
    Height = unlist(
      qdapRegex::ex_between(Height, left = "Height:", right = "mPl")
    )
  )
height

#>   Height
#> 1   1.77

Points

# Points: text of the nodes matched by ".sum td:nth-child(4)" on the points page
points <- html_text2(html_nodes(points_1, css = ".sum td:nth-child(4)"))
points

#> [1] "2928"

Partial Assemble

Cyclist Personal Data

# Scrape one rider profile page.
#
# link: URL (or path) of the profile page, passed to read_html().
#
# Returns a data frame with the text columns Name, Team, DateofBirth,
# Nationality, Weight and Height, one row per ".rdr-info-cont" block found
# on the page.
info_gather <- function(link){
  # Download and parse the page
  html <- read_html(link)

  # Name: first <h1> on the page
  name <- html_node(html, css = "h1") %>%
    html_text2()

  # Team
  team <- html_node(html, css = "span.red.showIfMobile") %>%
    html_text2()

  # Every remaining field is cut out of the same info block, so its text is
  # extracted once and reused
  info_text <- html_nodes(html, css = ".rdr-info-cont") %>%
    html_text2()

  # Text between two markers, exactly one value per info block: the first
  # match is used so all columns keep the same length
  between <- function(left, right) {
    found <- qdapRegex::ex_between(text.var = info_text,
                                   left = left, right = right)
    vapply(found, function(m) as.character(m[1]), character(1))
  }

  # DOB: remove the ordinal suffix only where it directly follows the day
  # number; a bare "th|st|nd|rd" pattern would also turn "August" into "Augu"
  dob <- between("Date of birth: ", "(")
  dob <- gsub("([0-9]+)(st|nd|rd|th)", "\\1", dob)

  # Nationality: cut everything from the "Weight" line onwards, then everything
  # up to the label, then the whitespace gsub() leaves around the value
  nationality <- gsub("\nWeight.*", "", info_text)
  nationality <- trimws(gsub(".*Nationality:", "", nationality))

  # Weight and height
  weight <- between("Weight:", "kg")
  height <- between("Height:", "mPl")

  data.frame(Name = name, Team = team, DateofBirth = dob,
             Nationality = nationality, Weight = weight, Height = height)
}

# Try the scraper on the second profile link
cyclist_link_2$link[2] %>% info_gather()

#>            Name              Team       DateofBirth Nationality Weight Height
#> 1 Tadej Pogacar UAE-Team Emirates 21 September 1998    Slovenia     66   1.76

# Scrape every profile page into a preallocated list, then stack the results
# once; calling rbind() inside the loop copies the growing data frame on every
# pass.
n_cyclists <- nrow(cyclist_link_2)
cyclist_pages <- vector("list", n_cyclists)

# seq_len() skips the loop when there are no links, whereas 1:0 would run it
# for i = 1 and i = 0
for(i in seq_len(n_cyclists)){
  message("Gather cyclist info #",i,"/",n_cyclists)
  cyclist_pages[[i]] <- info_gather(cyclist_link_2$link[i])
}

# Starting from an empty data frame keeps the result a data frame even when
# nothing was scraped
cyclist_link_3 <- do.call(rbind, c(list(data.frame()), cyclist_pages))

head(cyclist_link_3)

#>                 Name                    Team       DateofBirth Nationality
#> 1      Primož Roglic        Team Jumbo-Visma   29 October 1989    Slovenia
#> 2      Tadej Pogacar       UAE-Team Emirates 21 September 1998    Slovenia
#> 3      Wout van Aert        Team Jumbo-Visma 15 September 1994     Belgium
#> 4 Julian Alaphilippe Deceuninck - Quick Step      11 June 1992      France
#> 5 Alejandro Valverde           Movistar Team     25 April 1980       Spain
#> 6        Sam Bennett Deceuninck - Quick Step   16 October 1990     Ireland
#>   Weight Height
#> 1     65   1.77
#> 2     66   1.76
#> 3     78    1.9
#> 4     62   1.73
#> 5     61   1.77
#> 6     73   1.78

Points

# Scrape one rider points page.
#
# link: URL (or path) of the page, passed to read_html().
#
# Returns the text of every node matched by ".sum td:nth-child(4)".
info_gather2 <- function(link){
  # Download and parse the page
  results_page <- read_html(link)
  points_cells <- html_nodes(results_page, css = ".sum td:nth-child(4)")
  html_text2(points_cells)
}

# Try the points scraper on the fifth points link
points_link_2$link[5] %>% info_gather2()

#> [1] "1434"

# Scrape the points of every rider, one page per link.
cyclist_link_4 <- data.frame()
n_points_links <- nrow(points_link_2)

# seq_len() skips the loop when there are no links, whereas 1:0 would run it
# for i = 1 and i = 0
for(i in seq_len(n_points_links)){
  message("Gather cyclist points #",i,"/",n_points_links)
  
  page2 <- info_gather2(points_link_2$link[i])
  
  # Appending to the empty data frame names the single column after the first
  # value appended (the `X.2928.` header in the printed output). The renaming
  # step further down refers to that name, so the accumulation is kept as is.
  cyclist_link_4 <- rbind(cyclist_link_4,page2)
}

head(cyclist_link_4)

#>   X.2928.
#> 1    2928
#> 2    2515
#> 3    2297
#> 4    1445
#> 5    1434
#> 6    1383

Bind Cyclist + Points

Some of the column types are fixed here as well.

# Columns to convert once the two tables are combined
ToFactor <- c("Name", "Team", "Nationality")
ToNumber <- c("Weight","Height","Points")

# The personal data and the points are matched by row position, so both
# tables must have the same number of rows
stopifnot(nrow(cyclist_link_3) == nrow(cyclist_link_4))

cyclist_data <- cyclist_link_3 %>%
  # Take the points column by position: its auto-generated name is built from
  # the first rider's score (e.g. `X.2928.`) and changes with the ranking
  mutate(Points = cyclist_link_4[[1]]) %>%
  # all_of() marks the character vectors as external column-name lists
  mutate(across(all_of(ToFactor), as.factor),
         across(all_of(ToNumber), as.numeric))
head(cyclist_data)

#>                 Name                    Team       DateofBirth Nationality
#> 1      Primož Roglic        Team Jumbo-Visma   29 October 1989    Slovenia
#> 2      Tadej Pogacar       UAE-Team Emirates 21 September 1998    Slovenia
#> 3      Wout van Aert        Team Jumbo-Visma 15 September 1994     Belgium
#> 4 Julian Alaphilippe Deceuninck - Quick Step      11 June 1992      France
#> 5 Alejandro Valverde           Movistar Team     25 April 1980       Spain
#> 6        Sam Bennett Deceuninck - Quick Step   16 October 1990     Ireland
#>   Weight Height Points
#> 1     65   1.77   2928
#> 2     66   1.76   2515
#> 3     78   1.90   2297
#> 4     62   1.73   1445
#> 5     61   1.77   1434
#> 6     73   1.78   1383

# write.csv() does not create missing folders, so make sure the target exists
dir.create("data_input", showWarnings = FALSE, recursive = TRUE)
# Row names are written as the first column of the file
write.csv(cyclist_data, "data_input/cyclist.csv", row.names = TRUE)

Reread cyclist data from .csv file to save processing time.

# Load the saved table, drop the `X` column and parse the day-month-year
# strings into dates
cyclist <- "data_input/cyclist.csv" %>%
  read.csv(as.is = 1) %>%
  select(-X) %>%
  mutate(DateofBirth = lubridate::dmy(DateofBirth))
head(cyclist)

#>                 Name                    Team DateofBirth Nationality Weight
#> 1      Primož Roglic        Team Jumbo-Visma  1989-10-29    Slovenia     65
#> 2      Tadej Pogacar       UAE-Team Emirates  1998-09-21    Slovenia     66
#> 3      Wout van Aert        Team Jumbo-Visma  1994-09-15     Belgium     78
#> 4 Julian Alaphilippe Deceuninck - Quick Step  1992-06-11      France     62
#> 5 Alejandro Valverde           Movistar Team  1980-04-25       Spain     61
#> 6        Sam Bennett Deceuninck - Quick Step  1990-10-16     Ireland     73
#>   Height Points
#> 1   1.77   2928
#> 2   1.76   2515
#> 3   1.90   2297
#> 4   1.73   1445
#> 5   1.77   1434
#> 6   1.78   1383

Now that we have successfully collected all the data, we can move on to the next step: exploring the data itself.

Web Scraping with R

Ronny G.

21/4/2021

Library load

Scrap list of links

Gather information

Name

Team

Date of Birth (DOB)

Nationality

Weight

Height

Points

Partial Assemble

Cyclist Personal Data

Points

Bind Cyclist + Points