###################################
#                                 #
#     R Lunch - Webscraping       #
#     Support for slides          #
#  roygava.com/webscraping-lunch  #
#          2020-11-09             #
#                                 #
###################################

## ---------------------------------
# Load packages
library(rvest)      # HTML parsing and node/attribute extraction
library(dplyr)      # data manipulation verbs
library(tibble)     # modern data frames
library(stringr)    # string helpers (str_trim, str_c, ...)
library(lubridate)  # date parsing (dmy, ...)
library(RSelenium)  # browser automation for pages built client-side

## ---------------------------------
# Scrape the first HTML table from the Swiss BAG COVID-19 situation page.
url <- "https://www.bag.admin.ch/bag/en/home/krankheiten/ausbrueche-epidemien-pandemien/aktuelle-ausbrueche-epidemien/novel-cov/situation-schweiz-und-international.html"
read_html(url) %>%
  html_table() %>%
  .[[1]]

## ---------------------------------
# Ten most-viewed English Wikipedia articles from yesterday.
library(pageviews)
top_articles("en.wikipedia", start = (Sys.Date() - 1)) %>%
  select(article, views) %>%
  # slice_max() replaces the superseded top_n() and names the ranking
  # column explicitly (top_n(10) silently ranked by the last column
  # and printed a "Selecting by views" message).
  slice_max(views, n = 10)

## ---------------------------------
# Small example document used for the selector demos below.
# NOTE(review): the HTML tags appear to have been stripped from this copy
# of the script; the selector examples below expect markup with
# <p>/<em>/<ol>/<li>/<ul>/<a> elements — restore it from the slides.
html_page <- '

Webscraping with R

Basic experience with R and familiarity with Tidyverse is recommended.

Technologies

  1. HTML: Hypertext Markup Language
  2. CSS: Cascading Style Sheets

Packages

Note: rvest is included in the tidyverse

. '

## ---------------------------------
# Print the node tree of the parsed document.
read_html(html_page) %>% xml_structure()

## ---------------------------------
# Same example document, with class/id attributes for the CSS demos.
# NOTE(review): tags stripped in this copy as well.
html_page_css <- '

Webscraping with R

Basic experience with R and familiarity with the Tidyverse is recommended.

Technologies

  1. HTML: Hypertext Markup Language
  2. CSS: Cascading Style Sheets

Packages

Note: rvest is included in the tidyverse.

'

## ---------------------------------
# Type selector: every <li> node.
read_html(html_page) %>% html_nodes("li")

## ---------------------------------
# ... and its text content.
read_html(html_page) %>% html_nodes("li") %>% html_text()

## ---------------------------------
# Every <em> node's text.
read_html(html_page) %>% html_nodes("em") %>% html_text()

## ---------------------------------
# Grouping: nodes that are <li> OR <em>.
read_html(html_page) %>% html_nodes("li, em") %>% html_text()

## ---------------------------------
# Descendant combinator: <em> anywhere inside an <li>.
read_html(html_page) %>% html_nodes("li em") %>% html_text()

## ---------------------------------
# Descendant the other way round: <li> inside an <em>.
read_html(html_page) %>% html_nodes("em li") %>% html_text()

## ---------------------------------
# <em> anywhere inside a <p>.
read_html(html_page) %>% html_nodes("p em") %>% html_text()

## ---------------------------------
# Direct child: <em> whose parent is a <p>.
read_html(html_page) %>% html_nodes("p > em") %>% html_text()

## ---------------------------------
# Adjacent sibling: an <em> immediately following another <em>.
read_html(html_page) %>% html_nodes("em + em") %>% html_text()

## ---------------------------------
# Structural pseudo-classes: first / second / last list item.
read_html(html_page) %>% html_nodes("li:first-child") %>% html_text()

## ---------------------------------
read_html(html_page) %>% html_nodes("li:nth-child(2)") %>% html_text()

## ---------------------------------
read_html(html_page) %>% html_nodes("ol > li:last-child") %>% html_text()

## ---------------------------------
# Attributes: the href of every link.
read_html(html_page) %>% html_nodes("a") %>% html_attr("href")

## ---------------------------------
# ... restricted to links inside a <ul>.
read_html(html_page) %>% html_nodes("ul a") %>% html_attr("href")

## ---------------------------------
# Minimal table example.
# NOTE(review): the <table>/<tr>/<td> tags were stripped from this copy too.
basic_table <- '
CountryCapitalPopulation
UKLondon66.65
SwitzerlandBern8.57
'

## ---------------------------------
read_html(basic_table) %>% html_table()

## ---------------------------------
# Class selector.
read_html(html_page_css) %>% html_nodes(".content") %>% html_text()

## ---------------------------------
# Links inside the .content container.
read_html(html_page_css) %>% html_nodes(".content a") %>% html_text()

## ---------------------------------
# Id selector.
read_html(html_page_css) %>% html_nodes("#intro") %>% html_text()

## ---------------------------------
# Case study 1: Academy Award-winning films (Wikipedia).
oscar_parsed <- read_html("https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films")

oscar_nominees <- oscar_parsed %>%
  html_table(fill = TRUE) %>%
  .[[1]] %>%
  # as_data_frame() is deprecated in tibble; as_tibble() is its exact alias.
  as_tibble()

## ---------------------------------
oscar_nominees

## ---------------------------------
# Winning films are the rows styled with background #EEDD82;
# first cell of each such row is the title.
movie_title <- oscar_parsed %>%
  html_nodes("tr[style='background:#EEDD82'] > td:first-child") %>%
  html_text()

## ---------------------------------
movie_title

## ---------------------------------
# Second cell of the highlighted rows: the year.
movie_year <- oscar_parsed %>%
  html_nodes("tr[style='background:#EEDD82'] > td:nth-child(2)") %>%
  html_text()

## ---------------------------------
movie_year

## ---------------------------------
oscar_winners <- tibble(title = movie_title, year = movie_year)

## ---------------------------------
oscar_winners

## ---------------------------------
# Case study 2: OECD press releases.
url <- "https://www.oecd.org/newsroom/publicationsdocuments/bydate/"
oecd_parsed <- read_html(url)

## ---------------------------------
release_title <- oecd_parsed %>%
  html_nodes("h4 > a") %>%
  html_text()

## ---------------------------------
release_title

## ---------------------------------
# Same extraction, with surrounding whitespace removed.
release_title <- oecd_parsed %>%
  html_nodes("h4 > a") %>%
  html_text() %>%
  str_trim()

## ---------------------------------
release_title

## ---------------------------------
release_url <- oecd_parsed %>%
  html_nodes("h4 > a") %>%
  html_attr("href")

## ---------------------------------
release_url

## ---------------------------------
# The hrefs are site-relative; prefix the domain to get absolute URLs.
release_url <- oecd_parsed %>%
  html_nodes("h4 > a") %>%
  html_attr("href") %>%
  str_c("https://www.oecd.org", .)
## ---------------------------------
release_url

## ---------------------------------
# Release dates, first as raw text ...
release_date <- oecd_parsed %>%
  html_nodes(".date") %>%
  html_text()

## ---------------------------------
release_date

## ---------------------------------
# ... then parsed into Date objects (day-month-year order).
release_date <- oecd_parsed %>%
  html_nodes(".date") %>%
  html_text() %>%
  dmy()

## ---------------------------------
release_date

## ---------------------------------
release_language <- oecd_parsed %>%
  html_nodes(".infos > em") %>%
  html_text()

## ---------------------------------
release_language

## ---------------------------------
# Assemble the scraped columns into one tibble.
oecd <- tibble(
  date = release_date,
  title = release_title,
  language = release_language,
  url = release_url
)

## ---------------------------------
oecd

## ---------------------------------
# Download the full text of every release, pausing 10 s between requests
# to stay polite with the server.
oecd$text <- NA_character_  # was NA (logical); declare a character column up front
for (i in seq_along(oecd$url)) {
  print(str_c("Loading... ", oecd$url[i]))
  page_text <- read_html(oecd$url[i]) %>%
    html_nodes("#webEditContent") %>%
    html_text() %>%
    str_trim()
  # Guard: collapse to exactly one value so the assignment cannot error
  # when the node is missing (length 0) or duplicated (length > 1) on a page.
  oecd$text[i] <- if (length(page_text) == 0) NA_character_ else page_text[1]
  Sys.sleep(10)
}

## ---------------------------------
oecd

## ---------------------------------
oecd$text[1]

## ---------------------------------
# Pagination: link immediately after the current-page marker ("next" page).
next_page <- oecd_parsed %>%
  html_nodes(".currentpage + a") %>%
  html_attr("href")

## ---------------------------------
next_page

## ---------------------------------
# All page links in the paginator.
url_list <- oecd_parsed %>%
  html_nodes(".paginate > div:nth-child(1) > a") %>%
  html_attr("href")

## ---------------------------------
url_list

## ---------------------------------
# BIS press releases: a plain read_html() is tried first — the titles are
# apparently rendered client-side, which is why RSelenium is used next.
url <- "https://www.bis.org/press/pressrels.htm?r=1"
read_html(url) %>%
  html_nodes("td div.title") %>%
  html_text()

## ---------------------------------
library(RSelenium)

# Load RSelenium server and client
rd <- rsDriver(browser = "firefox")
remDr <- rd[["client"]]

# Go to URL
remDr$navigate(url)

## ---------------------------------
# Parse the HTML as rendered by the browser.
parsed_bis <- read_html(remDr$getPageSource()[[1]])

## ---------------------------------
parsed_bis %>%
  html_nodes("td div.title") %>%
  html_text() %>%
  str_trim()
## ---------------------------------
# Current pagination indicator text.
read_html(remDr$getPageSource()[[1]]) %>%
  html_nodes(".pageof") %>%
  html_text()

## ---------------------------------
# Select and click "Next"
next_btn <- remDr$findElement(using = "css", ".listbottom span.icon.icon-chevron-right")
next_btn$clickElement()
# Give the browser time to update before reading the source again:
# reading immediately after the click can return the stale, pre-click page.
Sys.sleep(2)

## ---------------------------------
# Re-read the indicator to confirm the page changed.
read_html(remDr$getPageSource()[[1]]) %>%
  html_nodes(".pageof") %>%
  html_text()

## ---------------------------------
# Stop RSelenium client and server
remDr$closeall()
rd$server$stop()

## ---------------------------------
# Building target URL for admin.ch media releases: the query string is
# assembled from one constant per parameter so each part is easy to vary.
url_base <- "https://www.admin.ch"
url_releases <- "/gov/en/start/documentation/media-releases.html"
url_start_date <- "?dyn_startDate="
url_end_date <- "&dyn_endDate="
url_page_number <- "&dyn_pageIndex="
url_organization <- "&dyn_organization="

start_date <- "01.10.2020"
end_date <- "31.10.2020"
page_number <- "0"
organization_number <- "1"

url <- str_c(url_base, url_releases,
             url_start_date, start_date,
             url_end_date, end_date,
             url_organization, organization_number,
             url_page_number, page_number)

## ---------------------------------
url