假设我有一个字符串列表:
# Example input: candidate article titles to look up.
strings <- c(
  "dog", "cat", "animal", "bird", "birds",
  "bqpohd", "ohphha", "mqphihpha", "aphhphohpa", "pohha"
)
我想检查一下这些字符串是否是Wikipedia文章的标题。
这是一个解决方案,但我认为对于长列表来说,这不是最快的方法:
# Run the check on the example vector and open the result in the data viewer.
# NOTE: CheckIfAStringIsTheTitleOfAWikipediaArticle() is defined further down
# in this script, so its definition must be evaluated before these two lines.
results.df <- CheckIfAStringIsTheTitleOfAWikipediaArticle(strings)
View(results.df)
#' Check whether strings are titles of English Wikipedia articles
#'
#' Sends one MediaWiki `action=query` request per element of `strings` and
#' reports a title as valid when a returned `<page>` node carries a `pageid`
#' attribute. The elapsed wall-clock time is printed before returning.
#'
#' Each element is passed to the API as a literal title: it is URL-encoded by
#' the HTTP client, so characters such as `&`, `#` or `|` are part of the
#' title rather than part of the query string.
#'
#' @param strings Character vector of candidate article titles. `NA` and
#'   empty strings are reported as invalid without contacting the API.
#' @return A data frame with one row per element of `strings` and two
#'   columns: `strings` (the input) and `validTitle` (logical).
CheckIfAStringIsTheTitleOfAWikipediaArticle <- function(strings) {
  start_time <- Sys.time()
  api_url <- "https://en.wikipedia.org/w/api.php"

  # NOTE(review): the API's `titles` parameter accepts several titles joined
  # by "|" in one request. Batching would cut the number of round trips, but
  # the API normalizes titles, so results would have to be mapped back to the
  # inputs via the <normalized> entries. Not done here.

  # Returns the `pageid` attribute of every <page> node in the API answer for
  # one title (NA for nodes without the attribute, character(0) if no node).
  get_page_ids <- function(title) {
    answer <- httr::GET(
      api_url,
      # Passing the parameters as a list lets httr escape the title.
      query = list(action = "query", format = "xml", titles = title)
    )
    page_xml <- xml2::read_xml(answer)
    nodes <- xml2::xml_find_all(page_xml, ".//query//pages//page")
    xml2::xml_attr(nodes, "pageid")
  }

  # TRUE when the API reports a page id for `string`.
  is_valid_title <- function(string) {
    title <- as.character(string)
    # Missing or empty input cannot name an article; skip the request.
    if (is.na(title) || !nzchar(title)) {
      return(FALSE)
    }
    # any() also copes with zero or several <page> nodes in the answer.
    any(!is.na(get_page_ids(title)))
  }

  # vapply() guarantees a logical vector, including logical(0) for empty input,
  # so the result always has both columns.
  validTitle <- vapply(strings, is_valid_title, logical(1), USE.NAMES = FALSE)
  results.df <- data.frame(strings, validTitle)

  elapsed <- Sys.time() - start_time
  print(elapsed)
  results.df
}
非常感谢您的帮助!