Web scraping to collect the top 100 world road cyclists, for the purpose of:
Building a cycling dream team
Betting
Sentimental purposes (following a specific cyclist or a team in general)
Check the site's /robots.txt first to make sure that we are allowed to do web scraping.
# package for data wrangling
library(dplyr)
library(tidyr)
library(readxl)
library(purrr)
# package for string manipulation
library(stringr)
library(stringi)
library(qdapRegex)
# package for web scraping
library(rvest)
library(RSelenium)
library(httr)
# package for exploratory data analysis
library(ggplot2)
There are two kinds of data that need to be scraped:
Cyclist Personal Data
Cyclist Points
They come from two different sets of links, so I will scrape two lists of links.
# Download the ranking page once; both link lists below are read from it.
link_stats <- "https://www.procyclingstats.com/rankings.php"
page_list <- read_html(link_stats)

# Cyclist stats: collect the relative href of every anchor matched by the
# selector, then turn each one into an absolute URL in the `link` column.
cyclist_link <- page_list %>%
  html_nodes(css = ".fs10+ td a") %>%
  html_attr("href")
cyclist_link_2 <- data.frame(cyclist_link) %>%
  mutate(link = paste0("https://www.procyclingstats.com/", cyclist_link))
head(cyclist_link_2)
#> cyclist_link
#> 1 rider/primoz-roglic
#> 2 rider/tadej-pogacar
#> 3 rider/wout-van-aert
#> 4 rider/julian-alaphilippe
#> 5 rider/alejandro-valverde
#> 6 rider/sam-bennett
#> link
#> 1 https://www.procyclingstats.com/rider/primoz-roglic
#> 2 https://www.procyclingstats.com/rider/tadej-pogacar
#> 3 https://www.procyclingstats.com/rider/wout-van-aert
#> 4 https://www.procyclingstats.com/rider/julian-alaphilippe
#> 5 https://www.procyclingstats.com/rider/alejandro-valverde
#> 6 https://www.procyclingstats.com/rider/sam-bennett
# Spot-check one of the profile URLs built above.
cyclist_link_2[["link"]][5]
#> [1] "https://www.procyclingstats.com/rider/alejandro-valverde"
# Points stats: same approach as for the profile links, using the selector
# for the points column; `link` holds the absolute URL.
points_link <- page_list %>%
  html_nodes(css = ".cu600+ td a") %>%
  html_attr("href")
points_link_2 <- data.frame(points_link) %>%
  mutate(link = paste0("https://www.procyclingstats.com/", points_link))
head(points_link_2)
#> points_link
#> 1 rider.php?date=2021-05-10&id=174582&p=results&s=pcs-ranking
#> 2 rider.php?date=2021-05-10&id=194619&p=results&s=pcs-ranking
#> 3 rider.php?date=2021-05-10&id=168961&p=results&s=pcs-ranking
#> 4 rider.php?date=2021-05-10&id=137427&p=results&s=pcs-ranking
#> 5 rider.php?date=2021-05-10&id=140924&p=results&s=pcs-ranking
#> 6 rider.php?date=2021-05-10&id=139405&p=results&s=pcs-ranking
#> link
#> 1 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=174582&p=results&s=pcs-ranking
#> 2 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=194619&p=results&s=pcs-ranking
#> 3 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=168961&p=results&s=pcs-ranking
#> 4 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=137427&p=results&s=pcs-ranking
#> 5 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=140924&p=results&s=pcs-ranking
#> 6 https://www.procyclingstats.com/rider.php?date=2021-05-10&id=139405&p=results&s=pcs-ranking
# Spot-check one URL from each list of links.
points_link_2[["link"]][5]
#> [1] "https://www.procyclingstats.com/rider.php?date=2021-05-10&id=140924&p=results&s=pcs-ranking"
cyclist_link_2[["link"]][2]
#> [1] "https://www.procyclingstats.com/rider/tadej-pogacar"
We’ve finished gathering the links for all of the top 100 ranked world cyclists from procyclingstats. Next, let's gather the information behind every link. We will scrape Name, Team, DOB, Nationality, Weight and Height. Points will be collected later.
First, let's save the main HTML pages in objects.
# Download the first profile page and the first points page; the extraction
# steps that follow are prototyped against these two documents.
cyclist_1 <- cyclist_link_2$link[1] %>% read_html()
points_1 <- points_link_2$link[1] %>% read_html()

# Name: text of the first `h1` element on the profile page.
name <- cyclist_1 %>%
  html_node(css = "h1") %>%
  html_text2()
name
#> [1] "Primož Roglic"
# Team: text of the first element matching `span.red.showIfMobile`.
team <- html_text2(html_node(cyclist_1, css = "span.red.showIfMobile"))
team
#> [1] "Team Jumbo-Visma"
# Date of birth: take the text of the `.rdr-info-cont` node(s), keep what lies
# between "Date of birth: " and the following "(", then drop the ordinal
# suffix of the day number.
dob <- html_nodes(cyclist_1, css = ".rdr-info-cont") %>%
  html_text2() %>%
  as.data.frame() %>%
  setNames("DateofBirth") %>%
  mutate(DateofBirth = unlist(qdapRegex::ex_between(
    text.var = DateofBirth,
    left = "Date of birth: ",
    right = "("
  ))) %>%
  # Remove "st"/"nd"/"rd"/"th" only when it directly follows a digit.
  # Removing those letters anywhere would also damage month names
  # (e.g. "August" would become "Augu").
  mutate(DateofBirth = gsub("(?<=[0-9])(st|nd|rd|th)\\b", "", DateofBirth,
                            perl = TRUE))
dob
#> DateofBirth
#> 1 29 October 1989
# Nationality: from the `.rdr-info-cont` text, cut everything from the
# "\nWeight" line onwards, then everything up to and including
# "Nationality:". Plain gsub() is used here because the author could not get
# the qdapRegex approach to work for this field.
nationality <- cyclist_1 %>%
  html_nodes(css = ".rdr-info-cont") %>%
  html_text2() %>%
  data.frame(Nationality = .) %>%
  mutate(
    Nationality = gsub("\nWeight.*", "", Nationality),
    Nationality = gsub(".*Nationality:", "", Nationality)
  )
nationality
#> Nationality
#> 1 Slovenia
# Weight: the text found between "Weight:" and "kg" in the `.rdr-info-cont`
# text.
weight <- cyclist_1 %>%
  html_nodes(css = ".rdr-info-cont") %>%
  html_text2() %>%
  data.frame(Weight = .) %>%
  mutate(Weight = unlist(
    qdapRegex::ex_between(Weight, left = "Weight:", right = "kg")
  ))
weight
#> Weight
#> 1 65
# Height: the text found between "Height:" and "mPl" in the `.rdr-info-cont`
# text. NOTE(review): the right marker presumably spans the unit "m" and the
# start of the next label -- confirm against the live page.
height <- cyclist_1 %>%
  html_nodes(css = ".rdr-info-cont") %>%
  html_text2() %>%
  data.frame(Height = .) %>%
  mutate(Height = unlist(
    qdapRegex::ex_between(Height, left = "Height:", right = "mPl")
  ))
height
#> Height
#> 1 1.77
# Points: text of the fourth cell of the `.sum` row(s) on the points page.
points <- html_text2(html_nodes(points_1, css = ".sum td:nth-child(4)"))
points
#> [1] "2928"
#' Scrape the personal data shown on one rider profile page.
#'
#' Reads the page at `link` and extracts the rider's name (first `h1`), team
#' (first `span.red.showIfMobile`) and, from the text of the `.rdr-info-cont`
#' node(s), the date of birth, nationality, weight and height.
#'
#' @param link URL (or anything else `read_html()` accepts) of a rider page.
#' @return A data.frame with the columns `Name`, `Team`, `DateofBirth`,
#'   `Nationality`, `Weight` and `Height`, holding the extracted text as-is
#'   (no numeric or date conversion is done here).
info_gather <- function(link) {
  html <- read_html(link)

  name <- html_text2(html_node(html, css = "h1"))
  team <- html_text2(html_node(html, css = "span.red.showIfMobile"))

  # All remaining fields are cut out of the same info panel, so its text is
  # extracted once and reused.
  info <- html_text2(html_nodes(html, css = ".rdr-info-cont"))

  # Date of birth: the text between "Date of birth: " and the following "(".
  dob <- unlist(qdapRegex::ex_between(
    text.var = info,
    left = "Date of birth: ",
    right = "("
  ))
  # Strip the ordinal suffix only when it directly follows the day number;
  # removing "st"/"nd"/"rd"/"th" anywhere would also damage month names
  # (e.g. "August" would become "Augu").
  dob <- gsub("(?<=[0-9])(st|nd|rd|th)\\b", "", dob, perl = TRUE)

  # Nationality: drop everything from the "\nWeight" line onwards, then
  # everything up to and including the "Nationality:" label.
  nationality <- gsub("\nWeight.*", "", info)
  nationality <- gsub(".*Nationality:", "", nationality)

  weight <- unlist(
    qdapRegex::ex_between(info, left = "Weight:", right = "kg")
  )
  height <- unlist(
    qdapRegex::ex_between(info, left = "Height:", right = "mPl")
  )

  data.frame(
    Name = name,
    Team = team,
    DateofBirth = dob,
    Nationality = nationality,
    Weight = weight,
    Height = height
  )
}
info_gather(link = cyclist_link_2$link[2])
#> Name Team DateofBirth Nationality Weight Height
#> 1 Tadej Pogacar UAE-Team Emirates 21 September 1998 Slovenia 66 1.76
# Scrape every profile link. Each page becomes one data frame; they are
# collected in a list and bound together once at the end instead of growing
# a data frame with rbind() on every iteration. seq_len() keeps the loop from
# running at all when there are no links (1:0 would iterate over 1 and 0).
n_cyclists <- nrow(cyclist_link_2)
cyclist_pages <- lapply(seq_len(n_cyclists), function(i) {
  message("Gather cyclist info #", i, "/", n_cyclists)
  info_gather(cyclist_link_2$link[i])
})
cyclist_link_3 <- bind_rows(cyclist_pages)
head(cyclist_link_3)
#> Name Team DateofBirth Nationality
#> 1 Primož Roglic Team Jumbo-Visma 29 October 1989 Slovenia
#> 2 Tadej Pogacar UAE-Team Emirates 21 September 1998 Slovenia
#> 3 Wout van Aert Team Jumbo-Visma 15 September 1994 Belgium
#> 4 Julian Alaphilippe Deceuninck - Quick Step 11 June 1992 France
#> 5 Alejandro Valverde Movistar Team 25 April 1980 Spain
#> 6 Sam Bennett Deceuninck - Quick Step 16 October 1990 Ireland
#> Weight Height
#> 1 65 1.77
#> 2 66 1.76
#> 3 78 1.9
#> 4 62 1.73
#> 5 61 1.77
#> 6 73 1.78
#' Scrape the points total from one rider points page.
#'
#' @param link URL (or anything else `read_html()` accepts) of a points page.
#' @return Character vector with the text of the fourth cell of the `.sum`
#'   row(s) found on the page.
info_gather2 <- function(link) {
  points_page <- read_html(link)
  html_text2(html_nodes(points_page, css = ".sum td:nth-child(4)"))
}
info_gather2(link = points_link_2$link[5])
#> [1] "1434"
# Scrape the points for every rider, one row per link.
# rbind() onto an empty data.frame produces a single auto-named column
# (e.g. `X.2928.`, derived from the first value); that shape is kept on
# purpose so existing code that consumes `cyclist_link_4` keeps working.
cyclist_link_4 <- data.frame()
n_points_pages <- nrow(points_link_2)
# seq_len() instead of 1:n so that nothing is fetched when there are no links.
for (i in seq_len(n_points_pages)) {
  message("Gather cyclist points #", i, "/", n_points_pages)
  page2 <- info_gather2(points_link_2$link[i])
  cyclist_link_4 <- rbind(cyclist_link_4, page2)
}
head(cyclist_link_4)
#> X.2928.
#> 1 2928
#> 2 2515
#> 3 2297
#> 4 1445
#> 5 1434
#> 6 1383
Fix some of the column types as well.
# Columns to convert once personal data and points are combined.
ToFactor <- c("Name", "Team", "Nationality")
ToNumber <- c("Weight", "Height", "Points")

# Attach the points as a column called `Points`. The single column of
# `cyclist_link_4` is taken by position because its name is auto-generated
# from the first scraped value (e.g. `X.2928.`) and therefore changes
# whenever the ranking changes.
cyclist_data <- cbind(cyclist_link_3, Points = cyclist_link_4[[1]])

# all_of() states explicitly that ToFactor/ToNumber are character vectors of
# column names, and errors if any listed column is missing.
cyclist_data <- cyclist_data %>%
  mutate(
    across(all_of(ToFactor), as.factor),
    across(all_of(ToNumber), as.numeric)
  )
head(cyclist_data)
#> Name Team DateofBirth Nationality
#> 1 Primož Roglic Team Jumbo-Visma 29 October 1989 Slovenia
#> 2 Tadej Pogacar UAE-Team Emirates 21 September 1998 Slovenia
#> 3 Wout van Aert Team Jumbo-Visma 15 September 1994 Belgium
#> 4 Julian Alaphilippe Deceuninck - Quick Step 11 June 1992 France
#> 5 Alejandro Valverde Movistar Team 25 April 1980 Spain
#> 6 Sam Bennett Deceuninck - Quick Step 16 October 1990 Ireland
#> Weight Height Points
#> 1 65 1.77 2928
#> 2 66 1.76 2515
#> 3 78 1.90 2297
#> 4 62 1.73 1445
#> 5 61 1.77 1434
#> 6 73 1.78 1383
# write.csv() does not create missing folders, so make sure the output
# directory exists first (a no-op without warning when it is already there).
dir.create("data_input", showWarnings = FALSE, recursive = TRUE)
# row.names = TRUE writes the row index as an extra first column; the code
# that re-reads this file drops it again (as column `X`).
write.csv(cyclist_data, "data_input/cyclist.csv", row.names = TRUE)
Re-read the cyclist data from the .csv file to save processing time.
# Load the saved scrape result, drop the row-index column `X` that was
# written alongside the data, and parse the day-month-year strings in
# `DateofBirth` into dates.
cyclist <- read.csv("data_input/cyclist.csv", as.is = 1) %>%
  select(-X) %>%
  mutate(DateofBirth = lubridate::dmy(DateofBirth))
head(cyclist)
#> Name Team DateofBirth Nationality Weight
#> 1 Primož Roglic Team Jumbo-Visma 1989-10-29 Slovenia 65
#> 2 Tadej Pogacar UAE-Team Emirates 1998-09-21 Slovenia 66
#> 3 Wout van Aert Team Jumbo-Visma 1994-09-15 Belgium 78
#> 4 Julian Alaphilippe Deceuninck - Quick Step 1992-06-11 France 62
#> 5 Alejandro Valverde Movistar Team 1980-04-25 Spain 61
#> 6 Sam Bennett Deceuninck - Quick Step 1990-10-16 Ireland 73
#> Height Points
#> 1 1.77 2928
#> 2 1.76 2515
#> 3 1.90 2297
#> 4 1.73 1445
#> 5 1.77 1434
#> 6 1.78 1383
Now that we have successfully collected all the data, we can move on to the next step: exploring the data itself.