带循环的网页抓取

2024-05-19 • 问答

我正在尝试用 for 循环从网站上抓取一些文本，但循环并没有前进到向量中的下一项（每次迭代都没有正确选中下一个元素）。如有任何建议，不胜感激。谢谢！

library(rvest)
library(xml2)


# Lookup table: one row per country; `url` holds the slug used on the site.
ID <- c(1:2)
Land <- c("Afghanistan", "Ägypten")
url <- c("afghanistan", "aegypten")
# Character NA so the Text column is a character column from the start.
Text <- NA_character_

data <- data.frame(ID, Land, Text)

# Loop over positions rather than over the slugs themselves, so each result
# can be written into the matching row of `data`.
for (i in seq_along(url)) {
  # Build the page URL exactly once: <site>/<slug>.
  # (Pasting the slug onto `nam` a second time yields a non-existent page.)
  nam <- paste("https://www.reporter-ohne-grenzen.de", url[i], sep = "/")

  webpage <- read_html(nam)
  nodes <- html_nodes(webpage, "div.text")

  # Store the text of the first 'div.text' node in row i only; assigning to
  # the whole column would overwrite every row on each iteration.
  data$Text[i] <- html_text(nodes[[1]])
}

嗯，不确定我是否把问题说清楚了。下面是我期望得到的数据输出示例。

library(rvest)
library(xml2)

# Lookup table with an empty character column that receives the scraped text.
ID <- c(1:2)
Land <- c("Afghanistan", "Ägypten")
Text <- NA_character_

data <- data.frame(ID, Land, Text)

# Manual scrape for the first country: download the page, select the
# 'div.text' nodes, and keep the text of the first one.
afghanistan <- "https://www.reporter-ohne-grenzen.de/afghanistan"
afghanistan <- read_html(afghanistan)
afghanistan <- html_nodes(afghanistan, "div.text")
afghanistan <- html_text(afghanistan)[[1]]

# The same four steps repeated for the second country.
aegypten <- "https://www.reporter-ohne-grenzen.de/aegypten"
aegypten <- read_html(aegypten)
aegypten <- html_nodes(aegypten, "div.text")
aegypten <- html_text(aegypten)[[1]]

# desired data output: one scraped text per row, in the same order as Land
data$Text <- c(afghanistan, aegypten)

我不想为180个国家/地区逐一重复下面这几行代码。

# Single-country scrape written as one expression: read the page, select the
# 'div.text' nodes, extract their text, and keep the first element.
aegypten <- html_text(
  html_nodes(
    read_html("https://www.reporter-ohne-grenzen.de/aegypten"),
    "div.text"
  )
)[[1]]

以下是解决方法：

library(rvest)
library(xml2)

# Lookup table: one row per country; `Url` holds the slug used on the site.
ID <- c(1:4)
Land <- c("Afghanistan", "Ägypten", "Deutschland", "Italien")
Url <- c("afghanistan", "aegypten", "deutschland", "italien")
# Character NA so the Text column does not start out as logical.
Text <- NA_character_

# Keep Land in the result so each scraped text stays labelled by country.
data <- data.frame(ID, Land, Text)
website <- "https://www.reporter-ohne-grenzen.de"

# Iterate over positions of `Url`, so the loop does not depend on the ID
# values happening to be 1..n.
for (i in seq_along(Url)) {
  country <- Url[i]

  html_url <- paste(website, country, sep = "/")
  output <- read_html(html_url)
  output <- html_nodes(output, "div.text")
  # Text of the first 'div.text' node on the page.
  output <- html_text(output)[[1]]

  # Write into row i only, leaving the other rows untouched.
  data$Text[i] <- output
}

library(rvest)
#> Loading required package: xml2
library(xml2)
library(tidyverse)

# Lookup table: one row per country; `url` holds the slug used on the site.
ID <- c(1:2)
Land <- c("Afghanistan", "Ägypten")
url <- c("afghanistan", "aegypten")
Text <- NA_character_

data <- data.frame(ID, Land, url, Text)

#' Scrape the text of the first 'div.text' node of a country page
#'
#' @param country_url Slug appended to the site root, e.g. "afghanistan".
#' @return A single character string: the text of the first 'div.text' node.
read_country <- function(country_url) {
  nam <- paste0("https://www.reporter-ohne-grenzen.de/", country_url)
  webpage <- read_html(nam)
  nodes <- html_nodes(webpage, "div.text")
  html_text(nodes[[1]])
}

# map_chr() calls read_country() once per slug and returns a character
# vector, which becomes the new Text column.
data <- data %>%
  mutate(Text = map_chr(url, read_country))

library(purrr)
library(rvest)

# NOTE: `url` (vector of country slugs) and `data` are not defined in this
# snippet; they must already exist in the session.
# map_chr() returns a character vector directly, so no separate flattening
# step is needed.
data$Text <- map_chr(
  paste0("https://www.reporter-ohne-grenzen.de/", url),
  function(page_url) {
    nodes <- html_nodes(read_html(page_url), "div.text")
    # Keep only the first matching node's text.
    html_text(nodes)[[1]]
  }
)

带循环的网页抓取

HEDEWEN830 回答：带循环的网页抓取

大家都在问