Hugo Future Imperfect Slim

Bryan Adams' Blog

Beat Navy!

Bryan Adams

6 minute read

I was approached to see if there was a way to web scrape several articles from PubMed. I normally like to use rvest for web scraping, but it would not work because of the JavaScript used to run PubMed. Luckily, RSelenium provides a way to easily interact with websites of this nature. This was my first use of RSelenium, but I have since learned more efficient ways of using RSelenium and rvest together, although I am sure there are still better ways out there.

The basic scrape function

This function is used to pull a lot of information from PubMed for each article. At first I thought I would have to make multiple functions, but this works to pull everything from authors to abstracts.

# Convert a Selenium web element into a one-column data frame holding the
# element's visible text.
#
# @param article An RSelenium webElement.
# @return A one-row, one-column data.frame. The column is deliberately named
#   "." to match the original magrittr-pipe behaviour; callers run
#   janitor::clean_names() afterwards, which turns "." into "x".
base_info_function <- function(article) {
  raw_html <- article$getElementAttribute("outerHTML")[[1]]
  visible_text <- html_text(read_html(raw_html))
  setNames(as.data.frame(visible_text), ".")
}

Setup — only do this once

# One-time setup: launch a Firefox session driven by RSelenium and open the
# PubMed homepage. The globals `remote_driver` and `base_url` are used by
# every scraping function below.
driver <- rsDriver(browser = "firefox")
remote_driver <- driver$client
base_url <- "https://www.ncbi.nlm.nih.gov/pubmed"

# Start on the search page so the first scrape has somewhere to begin.
remote_driver$navigate(base_url)

A function that runs the search for each term we are interested in.

When I web scrape I like to write functions. This is a very long function, but it serves the purpose. Had I used rvest, I would have been able to make it simpler.

# Scrape PubMed search results for a single search term.
#
# Relies on the globals `remote_driver` (an active RSelenium client) and
# `base_url` (the PubMed homepage), plus `base_info_function()` defined above.
# Side effect: writes "<term>.csv" with the full article table when abstracts
# are available.
#
# @param term A single search string, e.g. "(Chromium supplement) and Diabetes".
# @return A data.frame containing the term and the result count; when the
#   abstract view is available it also contains 1/0 keyword-count summary
#   columns (Cohort, RCT, HbA1c, ...). Rows from terms without abstracts will
#   get NA in the summary columns when bound together.
scrape_by_vitamen <- function(term) {

  # 1/0 indicator for a regex hit, vectorised over the abstracts column.
  flag <- function(text, pattern) ifelse(str_detect(text, pattern), 1, 0)

  # Look up a page element, returning NULL instead of erroring when it is
  # absent (e.g. too few results for the paging/format controls to render).
  find_or_null <- function(using, value) {
    tryCatch(
      remote_driver$findElement(using = using, value = value),
      error = function(e) NULL
    )
  }

  # Go to the homepage and run the search.
  remote_driver$navigate(base_url)
  search_field <- remote_driver$findElement(using = "css", value = "#term")
  search_button <- remote_driver$findElement(using = "css", value = "#search")
  search_field$sendKeysToElement(list(term))
  search_button$clickElement()

  # Switch to 100 results per page when the control exists (it is absent when
  # there are not enough articles).
  page_per_button <- find_or_null("css", "li:nth-child(3) .tgt_dark")
  if (!is.null(page_per_button)) {
    page_per_button$clickElement()
    display_100_button <- remote_driver$findElement(using = "css", value = "#ps100")
    display_100_button$clickElement()
    Sys.sleep(15)  # give the page time to reload
  }

  # The result count is shown even when no abstract view is available.
  num_articles <- remote_driver$findElement(using = "css", value = ".result_count")
  num <- num_articles$getElementAttribute("outerHTML")[[1]] %>%
    read_html() %>%
    html_text() %>%
    sub(".*\\s", "", .)  # keep only the trailing number

  # The format control is missing when there are no (or too few) articles;
  # in that case return just the term and the raw count.
  format_button <- find_or_null("css", "li:nth-child(1) .tgt_dark")
  if (is.null(format_button)) {
    return(as.data.frame(cbind(term, num)))
  }

  # Switch the display format to abstracts and apply the abstract filter.
  format_button$clickElement()
  Sys.sleep(10)
  abstract_button <- remote_driver$findElement(using = "css", value = "#abstract")
  abstract_button$clickElement()
  Sys.sleep(10)
  filter_abstracts <- remote_driver$findElement(
    using = "id",
    value = "EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Filters.Filter"
  )
  filter_abstracts$clickElement()
  Sys.sleep(10)

  # Scrape titles.
  article_titles <- remote_driver$findElements(using = "css", value = "#maincontent h1 a")
  titles <- article_titles %>%
    purrr::map(base_info_function) %>%
    bind_rows() %>%
    janitor::clean_names() %>%
    rename(Title = x)

  # Scrape authors; fall back to ".auths" alone when the affiliation rows
  # make the element count disagree with the number of titles.
  article_authors <- remote_driver$findElements(using = "css", value = ".auths, .aff p")
  if (length(article_authors) != length(article_titles)) {
    article_authors <- remote_driver$findElements(using = "css", value = ".auths")
  }
  authors <- article_authors %>%
    purrr::map(base_info_function) %>%
    bind_rows() %>%
    janitor::clean_names() %>%
    rename(Authors = x)

  # Scrape abstracts and derive 1/0 indicator columns for study types and
  # outcomes of interest.
  article_abstracts <- remote_driver$findElements(using = "css", value = ".abstr")
  abstracts <- article_abstracts %>%
    purrr::map(base_info_function) %>%
    bind_rows() %>%
    janitor::clean_names() %>%
    rename(Abstract = x) %>%
    mutate(Abstract = str_replace(Abstract, pattern = "Abstract", replacement = "")) %>%
    mutate(
      Cohort = flag(Abstract, "cohort|Cohort"),
      Observational = flag(Abstract, "observational|Observational"),
      RCT = flag(Abstract, "RCT|randomized controlled trials"),
      `Meta-analysis` = flag(Abstract, "meta-analysis"),
      `Systematic review` = flag(Abstract, "systematic review"),
      `Clinical trial` = flag(Abstract, "clinical trial"),
      HbA1c = flag(Abstract, "HbA1c"),
      `Fasting glucose` = flag(Abstract, "fasting glucose"),
      Insulin = flag(Abstract, "insulin")
    )

  # Scrape citations and derive journal, year, and DOI/URL from the text.
  article_citations <- remote_driver$findElements(using = "css", value = ".aff p , .cit")
  citations <- article_citations %>%
    purrr::map(base_info_function) %>%
    bind_rows() %>%
    janitor::clean_names() %>%
    rename(Citation = x) %>%
    mutate(
      Journal = sub("\\..*", "", Citation),
      Year = str_extract(Citation, "(?<=\\s)\\d+"),
      # DOI: strip everything before "doi:", then trailing sentence/period.
      DOI = trimws(sub("[.]$", "", sub("\\.\\s.*$", "", sub(".*doi\\:", "", Citation)))),
      URL = paste0("https://doi.org/", DOI)
    ) %>%
    select(Journal, Year, Citation, URL, DOI)

  ARTICLES <- cbind(titles, authors, citations, abstracts)

  # Persist the full per-article table for this term.
  write_csv(ARTICLES, paste0(term, ".csv"))

  # Per-term totals of the indicator columns.
  article_summary <- ARTICLES %>%
    summarise(
      Cohort = sum(Cohort),
      Observational = sum(Observational),
      RCT = sum(RCT),
      `Meta-Analysis` = sum(`Meta-analysis`),
      `Systematic Review` = sum(`Systematic review`),
      `Clinical trial` = sum(`Clinical trial`),
      HbA1c = sum(HbA1c),
      `Fasting glucose` = sum(`Fasting glucose`),
      `Insulin` = sum(`Insulin`)
    )

  df <- as.data.frame(cbind(term, num))
  cbind(df, article_summary)
}

Call the function to scrape

# Drive the scrape: run every search term and collect the per-term summaries.
# `search_terms` is a data frame of search strings defined elsewhere
# (not shown in this post).

# Single-term smoke test; the result is intentionally discarded.
scrape_by_vitamen(search_terms[2, 1])

# Named `summary_df` (not `summary`) so we do not mask base::summary().
summary_df <- search_terms %>%
  unlist() %>%
  purrr::map(scrape_by_vitamen) %>%
  bind_rows()

# Persist the combined per-term summary table.
summary_df %>%
  write_csv("Summary.csv")

Get Potassium and Chromium

# Manual re-run of the scraping steps for "(Chromium supplement) and
# Diabetes", executed against whatever results page `remote_driver` is
# currently showing. Presumably the automated loop failed for the Potassium
# and Chromium terms, so the steps are repeated by hand — TODO confirm.

# Article titles -> one data frame per element -> single Title column.
article_titles = remote_driver$findElements(using = 'css', value = "#maincontent h1 a")
  
    titles = article_titles%>%
      purrr::map(base_info_function)%>%
      bind_rows()%>%
      janitor::clean_names()%>%
      rename(Title = x)
    
# Authors. slice(-32) drops row 32 — presumably a stray element on this
# particular page that misaligned authors with titles; verify before reuse.
article_authors = remote_driver$findElements(using = 'css', value = ".auths")

authors = article_authors%>%
      purrr::map(base_info_function)%>%
      bind_rows()%>%
      janitor::clean_names()%>%
      rename(Authors = x)%>%
      slice(-32)

# Abstract text, plus 1/0 keyword indicator columns for study types and
# diabetes-related outcomes (same scheme as scrape_by_vitamen above).
article_abstracts = remote_driver$findElements(using = 'css', value = '.abstr')
  
    abstracts = article_abstracts%>%
      purrr::map(base_info_function)%>%
      bind_rows()%>%
      janitor::clean_names()%>%
      rename(Abstract = x)%>%
      mutate(Abstract = str_replace(Abstract, pattern = "Abstract", replacement = ""))%>%
      mutate(Cohort = case_when(
        str_detect(Abstract, pattern = "cohort|Cohort") == T ~ 1,
        TRUE ~ 0 
      ))%>%
      mutate(Observational = case_when(
        str_detect(Abstract, pattern = "observational|Observational") == T ~ 1,
        TRUE ~ 0 
      ))%>%
      mutate(RCT = case_when(
        str_detect(Abstract, pattern = "RCT|randomized controlled trials") == T ~ 1,
        TRUE ~ 0 
      ))%>%
      mutate(`Meta-analysis` = case_when(
        str_detect(Abstract, pattern = "meta-analysis") == T ~ 1,
        TRUE ~ 0 
      ))%>%
      mutate(`Systematic review` = case_when(
        str_detect(Abstract, pattern = "systematic review") == T ~ 1,
        TRUE ~ 0 
      ))%>%
      mutate(`Clinical trial` = case_when(
        str_detect(Abstract, pattern = "clinical trial") == T ~ 1,
        TRUE ~ 0 
      ))%>%
      mutate(HbA1c = case_when(
        str_detect(Abstract, pattern = "HbA1c") == T ~ 1,
        TRUE ~ 0 
      ))%>%
      mutate(`Fasting glucose` = case_when(
        str_detect(Abstract, pattern = "fasting glucose") == T ~ 1,
        TRUE ~ 0 
      ))%>%
      mutate(`Insulin` = case_when(
        str_detect(Abstract, pattern = "insulin") == T ~ 1,
        TRUE ~ 0 
      ))
    
    # Citations: derive Journal (text before first "."), Year (first number
    # after a space), and the DOI by stripping everything before "doi:" and
    # trailing sentence text/periods; URL is the DOI resolver link.
    article_citations = remote_driver$findElements(using = 'css', value = '.aff p , .cit' )
  
    citations = article_citations%>%
      purrr::map(base_info_function)%>%
      bind_rows()%>%
      janitor::clean_names()%>%
      rename(Citation = x)%>%
      mutate(Journal = sub("\\..*","", Citation))%>%
      mutate(Year = str_extract(Citation, '(?<=\\s)\\d+'))%>%
      mutate(url = sub(".*doi\\:","", Citation))%>%
      mutate(url2 = sub("\\.\\s.*$","",url))%>%
      mutate(url3 = sub("[.]$","",url2))%>%
      mutate(url4 = trimws(url3))%>%
      mutate(url5 = paste("https://doi.org/",url4,sep = ""))%>%
      rename(URL = url5, DOI = url4)%>%
      select(Journal,Year,Citation,URL,DOI)
    
    # Column-bind assumes titles, authors, citations, and abstracts all have
    # the same number of rows (hence the slice(-32) fix above).
    ARTICLES = cbind(titles, authors, citations, abstracts)
    
    # Persist the per-article table for this hand-run term.
    ARTICLES%>%
      write_csv("(Chromium supplement) and Diabetes.csv")
# Per-term totals of the indicator columns, matching scrape_by_vitamen.
article_summary = ARTICLES%>%
      summarise(Cohort = sum(Cohort),
                Observational = sum(Observational),
                RCT = sum(RCT),
                `Meta-Analysis` = sum(`Meta-analysis`),
                `Systematic Review` = sum(`Systematic review`),
                `Clinical trial` = sum(`Clinical trial`),
                HbA1c = sum(HbA1c),
                `Fasting glucose` = sum(`Fasting glucose`),
                `Insulin` = sum(`Insulin`))

Recent posts

See more

Categories

About

I am an assistant professor at the United States Military Academy. I currently teach MA206Y: Introduction to Data Science and Statistics. I have served in the Army for 11 years and have a beautiful wife and two wonderful children.