I was approached to see if there was a way to web scrape several articles from PubMed. I normally like to use rvest for web scraping, but it would not work here because PubMed relies on JavaScript to render its pages. Luckily, RSelenium provides a way to easily interact with websites of this nature. This was my first use of RSelenium; I have since found much more efficient ways of using RSelenium and rvest together, although I am sure there are still better ways out there.
The basic scrape function
This function is used to pull a lot of information from PubMed for each article. At first I thought I would have to make multiple functions, but this works to pull everything from authors to abstracts.
# Extract the text of a single RSelenium web element as a data frame.
#
# article: a web element exposing getElementAttribute().
# Returns: the html_text() of the element's "outerHTML", wrapped by
#   as.data.frame().
base_info_function <- function(article) {
  element_html <- article$getElementAttribute("outerHTML")[[1]]
  element_text <- html_text(read_html(element_html))
  # The last step deliberately stays a magrittr pipe: callers rename the
  # cleaned column `x`, which presumably derives from the pipe placeholder
  # `.` being used as the column name -- calling as.data.frame(element_text)
  # directly would name the column differently.
  element_text %>%
    as.data.frame()
}
Organize the vitamins to search
# Load the ingredient counts; the code below relies on the columns `n` and
# `Content`.
vitamens <- read_csv("Count.csv")

# Keep ingredients that occur at least 100 times, leaving out three entries
# that are not searched by the automated loop.
vitamens_search <- vitamens %>%
  filter(
    n >= 100,
    Content != "Potassium",
    Content != "Chromium",
    Content != "Proprietary Blend"
  ) %>%
  select(Content)

# One PubMed query string per remaining ingredient, stored in column
# `vitamens`. (An earlier assignment that pasted the whole `vitamens` data
# frame into the query was removed: it was immediately overwritten and built
# the strings from the data frame instead of the `Content` column.)
search_terms <- vitamens_search %>%
  transmute(vitamens = paste0("(", Content, " supplement) AND Diabetes"))
Setup (only needs to be done once)
# Address of the PubMed landing page used as the starting point of every search.
base_url <- "https://www.ncbi.nlm.nih.gov/pubmed"

# Start a Firefox session through RSelenium and keep a handle on its client,
# then open the landing page once to confirm the session works.
driver <- rsDriver(browser = "firefox")
remote_driver <- driver$client
remote_driver$navigate(base_url)
A function that will search for all of the terms we want to search for.
When I web scrape I like to make functions. This is a very long function, but serves the purpose. Had I used rvest I would have been able to make it simpler.
# Return the first element matching the CSS selector `css`, or NULL when the
# lookup raises an error (for example because the element is not on the page).
find_element_or_null <- function(css) {
  tryCatch(
    remote_driver$findElement(using = "css", value = css),
    error = function(e) NULL
  )
}

# Read the `.result_count` element and keep the text after its last whitespace.
read_result_count <- function() {
  count_element <- remote_driver$findElement(
    using = "css",
    value = ".result_count"
  )
  count_element$getElementAttribute("outerHTML")[[1]] %>%
    read_html() %>%
    html_text() %>%
    sub(".*\\s", "", .)
}

# Convert a list of web elements into a data frame whose text column is `x`.
scrape_text <- function(elements) {
  elements %>%
    purrr::map(base_info_function) %>%
    bind_rows() %>%
    janitor::clean_names()
}

# 1 where `text` matches the (case-sensitive) regex `pattern`, otherwise 0.
# A missing match result also yields 0.
flag_keyword <- function(text, pattern) {
  case_when(
    str_detect(text, pattern = pattern) ~ 1,
    TRUE ~ 0
  )
}

# Derive Journal, Year, DOI and URL columns from a `Citation` text column.
parse_citations <- function(citations) {
  citations %>%
    mutate(
      # Everything before the first period.
      Journal = sub("\\..*", "", Citation),
      # First run of digits that follows a whitespace character.
      Year = str_extract(Citation, "(?<=\\s)\\d+"),
      # Text after "doi:", cut at the first ". " and stripped of a trailing
      # period and surrounding whitespace.
      doi_tail = sub(".*doi\\:", "", Citation),
      doi_raw = sub("\\.\\s.*$", "", doi_tail),
      DOI = trimws(sub("[.]$", "", doi_raw)),
      URL = paste0("https://doi.org/", DOI)
    ) %>%
    select(Journal, Year, Citation, URL, DOI)
}

# Search PubMed for one query and scrape the abstract view of the results.
#
# Uses the global `remote_driver` and `base_url`.
#
# term: the query string. A one-cell data frame/tibble is also accepted and
#   is flattened to its character value.
# Returns: a one-row data frame with `term` and `num` (the text taken from the
#   result counter). When the results page offers the format menu, the row
#   also carries the per-keyword article counts, and the scraped articles are
#   written to "<term>.csv" in the working directory.
scrape_by_vitamen <- function(term) {
  # Callers may hand over a tibble cell; the browser and the file name need a
  # plain character value.
  term <- as.character(unlist(term, use.names = FALSE))

  # Go to the homepage and run the search.
  remote_driver$navigate(base_url)
  search_field <- remote_driver$findElement(using = "css", value = "#term")
  search_button <- remote_driver$findElement(using = "css", value = "#search")
  search_field$sendKeysToElement(list(term))
  search_button$clickElement()

  # Display 100 items per page. The menu is absent when there are too few
  # articles, in which case the page is left as it is.
  per_page_button <- find_element_or_null("li:nth-child(3) .tgt_dark")
  if (!is.null(per_page_button)) {
    per_page_button$clickElement()
    display_100_button <- remote_driver$findElement(
      using = "css",
      value = "#ps100"
    )
    display_100_button$clickElement()
    Sys.sleep(15)
  }

  # If there is no format menu there are probably no articles to scrape.
  format_button <- find_element_or_null("li:nth-child(1) .tgt_dark")

  num <- read_result_count()
  summary_row <- data.frame(term = term, num = num, stringsAsFactors = FALSE)
  if (is.null(format_button)) {
    return(summary_row)
  }

  # Switch the listing to the abstract format.
  format_button$clickElement()
  Sys.sleep(10)
  abstract_button <- remote_driver$findElement(
    using = "css",
    value = "#abstract"
  )
  abstract_button$clickElement()
  Sys.sleep(10)

  # Apply abstract filter
  filter_abstracts <- remote_driver$findElement(
    using = "id",
    value = "EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Filters.Filter"
  )
  filter_abstracts$clickElement()
  Sys.sleep(10)

  # Scrape titles
  article_titles <- remote_driver$findElements(
    using = "css",
    value = "#maincontent h1 a"
  )
  titles <- scrape_text(article_titles) %>%
    rename(Title = x)

  # Scrape authors. The wider selector is tried first; when it does not give
  # exactly one element per title, fall back to `.auths` alone.
  article_authors <- remote_driver$findElements(
    using = "css",
    value = ".auths, .aff p"
  )
  if (length(article_authors) != length(article_titles)) {
    article_authors <- remote_driver$findElements(
      using = "css",
      value = ".auths"
    )
  }
  authors <- scrape_text(article_authors) %>%
    rename(Authors = x)

  # Scrape abstracts and flag the keywords of interest.
  # NOTE(review): the patterns are case-sensitive and "randomized controlled
  # trials" does not match the singular form -- confirm this is intended.
  article_abstracts <- remote_driver$findElements(
    using = "css",
    value = ".abstr"
  )
  abstracts <- scrape_text(article_abstracts) %>%
    rename(Abstract = x) %>%
    mutate(
      # Drop the first literal "Abstract" (the section heading).
      Abstract = str_replace(Abstract, pattern = "Abstract", replacement = ""),
      Cohort = flag_keyword(Abstract, "cohort|Cohort"),
      Observational = flag_keyword(Abstract, "observational|Observational"),
      RCT = flag_keyword(Abstract, "RCT|randomized controlled trials"),
      `Meta-analysis` = flag_keyword(Abstract, "meta-analysis"),
      `Systematic review` = flag_keyword(Abstract, "systematic review"),
      `Clinical trial` = flag_keyword(Abstract, "clinical trial"),
      HbA1c = flag_keyword(Abstract, "HbA1c"),
      `Fasting glucose` = flag_keyword(Abstract, "fasting glucose"),
      `Insulin` = flag_keyword(Abstract, "insulin")
    )

  # Scrape citations
  article_citations <- remote_driver$findElements(
    using = "css",
    value = ".aff p , .cit"
  )
  citations <- scrape_text(article_citations) %>%
    rename(Citation = x) %>%
    parse_citations()

  # One row per article; the four pieces must have the same number of rows.
  ARTICLES <- cbind(titles, authors, citations, abstracts)
  write_csv(ARTICLES, paste0(term, ".csv"))

  article_summary <- ARTICLES %>%
    summarise(
      Cohort = sum(Cohort),
      Observational = sum(Observational),
      RCT = sum(RCT),
      `Meta-Analysis` = sum(`Meta-analysis`),
      `Systematic Review` = sum(`Systematic review`),
      `Clinical trial` = sum(`Clinical trial`),
      HbA1c = sum(HbA1c),
      `Fasting glucose` = sum(`Fasting glucose`),
      `Insulin` = sum(`Insulin`)
    )

  cbind(summary_row, article_summary)
}
Call the function to scrape
# Trial run on the second query. `search_terms[2, 1]` would hand over a 1x1
# tibble rather than a string, so the value is taken from the column instead.
scrape_by_vitamen(search_terms$vitamens[[2]])

# Scrape every query and stack the one-row summaries into a single table.
summary <- search_terms$vitamens %>%
  purrr::map(scrape_by_vitamen) %>%
  bind_rows()

write_csv(summary, "Summary.csv")
Get Potassium and Chromium
# Manual scrape of whatever results page is currently open in `remote_driver`
# (the listing has to be in the abstract format already).

# Titles
article_titles <- remote_driver$findElements(
  using = "css",
  value = "#maincontent h1 a"
)
titles <- purrr::map(article_titles, base_info_function) %>%
  bind_rows() %>%
  janitor::clean_names() %>%
  rename(Title = x)

# Authors; row 32 is dropped by hand.
# NOTE(review): presumably an extra, non-author match on this page -- confirm.
article_authors <- remote_driver$findElements(using = "css", value = ".auths")
authors <- purrr::map(article_authors, base_info_function) %>%
  bind_rows() %>%
  janitor::clean_names() %>%
  rename(Authors = x) %>%
  slice(-32)

# Abstracts, with a 0/1 flag per keyword of interest (case-sensitive matches;
# a missing match result counts as 0).
article_abstracts <- remote_driver$findElements(using = "css", value = ".abstr")
abstracts <- purrr::map(article_abstracts, base_info_function) %>%
  bind_rows() %>%
  janitor::clean_names() %>%
  rename(Abstract = x) %>%
  mutate(
    Abstract = str_replace(Abstract, pattern = "Abstract", replacement = ""),
    Cohort = if_else(
      str_detect(Abstract, pattern = "cohort|Cohort"), 1, 0, missing = 0
    ),
    Observational = if_else(
      str_detect(Abstract, pattern = "observational|Observational"),
      1, 0,
      missing = 0
    ),
    RCT = if_else(
      str_detect(Abstract, pattern = "RCT|randomized controlled trials"),
      1, 0,
      missing = 0
    ),
    `Meta-analysis` = if_else(
      str_detect(Abstract, pattern = "meta-analysis"), 1, 0, missing = 0
    ),
    `Systematic review` = if_else(
      str_detect(Abstract, pattern = "systematic review"), 1, 0, missing = 0
    ),
    `Clinical trial` = if_else(
      str_detect(Abstract, pattern = "clinical trial"), 1, 0, missing = 0
    ),
    HbA1c = if_else(
      str_detect(Abstract, pattern = "HbA1c"), 1, 0, missing = 0
    ),
    `Fasting glucose` = if_else(
      str_detect(Abstract, pattern = "fasting glucose"), 1, 0, missing = 0
    ),
    `Insulin` = if_else(
      str_detect(Abstract, pattern = "insulin"), 1, 0, missing = 0
    )
  )

# Citations, split into journal, year and DOI.
article_citations <- remote_driver$findElements(
  using = "css",
  value = ".aff p , .cit"
)
citations <- purrr::map(article_citations, base_info_function) %>%
  bind_rows() %>%
  janitor::clean_names() %>%
  rename(Citation = x) %>%
  mutate(
    # Everything before the first period.
    Journal = sub("\\..*", "", Citation),
    # First run of digits that follows a whitespace character.
    Year = str_extract(Citation, "(?<=\\s)\\d+"),
    # Text after "doi:", cut at the first ". ", without a trailing period.
    DOI = trimws(
      sub("[.]$", "", sub("\\.\\s.*$", "", sub(".*doi\\:", "", Citation)))
    ),
    URL = paste0("https://doi.org/", DOI)
  ) %>%
  select(Journal, Year, Citation, URL, DOI)

# Combine the pieces column-wise (one row per article) and save them.
ARTICLES <- cbind(titles, authors, citations, abstracts)
write_csv(ARTICLES, "(Chromium supplement) and Diabetes.csv")

# Number of articles mentioning each keyword.
article_summary <- summarise(
  ARTICLES,
  Cohort = sum(Cohort),
  Observational = sum(Observational),
  RCT = sum(RCT),
  `Meta-Analysis` = sum(`Meta-analysis`),
  `Systematic Review` = sum(`Systematic review`),
  `Clinical trial` = sum(`Clinical trial`),
  HbA1c = sum(HbA1c),
  `Fasting glucose` = sum(`Fasting glucose`),
  `Insulin` = sum(`Insulin`)
)

Twitter
Facebook
Reddit
LinkedIn
StumbleUpon
Pinterest
Email