Need help looping through multiple web pages with Scrapy in Python and crawling to the next page from each of them

Currently I am crawling several websites at once, and I need to follow each site's next page, taking the next-page link from the page just crawled. So the next page of every page has to be fetched continuously. Note that the second page of each site has the same div content.

Spider.py

import scrapy

from ..items import MynewsItem  # assumes the default Scrapy project layout


class UstodaySpider(scrapy.Spider):
    name = 'usatoday'

    start_urls = ['https://en.wikipedia.org/wiki/India',
                  'https://en.wikipedia.org/wiki/USA']

    def parse(self, response):
        items = MynewsItem()
        print("**********************************")
        print(type(response))
        print(response.url)

        all_section = response.css('a.gnt_m_flm_a')

        for quote in all_section:
            news_provider_id = '14'
            news_title = quote.css('a.gnt_m_flm_a').xpath("text()").extract()
            news_details = quote.css('a.gnt_m_flm_a').xpath("@data-c-br").extract()
            news_image = quote.css("img.gnt_m_flm_i").xpath("@data-gl-srcset").extract()
            news_page_url = quote.css('a.gnt_m_flm_a').xpath("@href").extract()

            items['news_provider_id'] = news_provider_id
            items['news_title'] = news_title
            items['news_details'] = news_details
            items['news_image'] = news_image
            items['news_page_url'] = news_page_url

        yield items
        # My attempt at building the next-page URL; this is the part I am stuck on.
        next_page = 'https://en.wikipedia.org/wiki/India' + str(news_page_url)
        print(next_page)

Pipeline.py

import mysql.connector


class MynewsPipeline(object):
    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = mysql.connector.connect(
            host='localhost', user='root', password='', database='mydb', port='3306'
        )
        self.curr = self.conn.cursor()

    def create_table(self):
        self.curr.execute("""DROP TABLE IF EXISTS news_crawl_newsdetails""")
        self.curr.execute("""CREATE TABLE news_crawl_newsdetails(
                        news_provider_id text, news_title text, news_details text,
                        news_image text, news_page_url text
                        )""")

    def process_item(self, item, spider):
        self.store_db(item)
        return item

    def store_db(self, item):
        # Five columns need five %s placeholders (the original had only three).
        self.curr.execute(
            """INSERT INTO news_crawl_newsdetails
               (news_provider_id, news_title, news_details, news_image, news_page_url)
               VALUES (%s, %s, %s, %s, %s)""",
            (item['news_provider_id'], item['news_title'][0], item['news_details'][0],
             item['news_image'][0], item['news_page_url'][0])
        )
        self.conn.commit()
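
One thing worth adding here (not in the original code, but a standard Scrapy pipeline hook) is a close_spider method inside MynewsPipeline, so the MySQL cursor and connection are released when the crawl finishes; a minimal sketch:

    def close_spider(self, spider):
        # Called once by Scrapy when the spider closes; tidy up the DB handles.
        self.curr.close()
        self.conn.close()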

Items.py

import scrapy


class MynewsItem(scrapy.Item):
    news_provider_id = scrapy.Field()
    news_title = scrapy.Field()
    news_details = scrapy.Field()
    news_image = scrapy.Field()
    news_page_url = scrapy.Field()
    news_des = scrapy.Field()
tonghai0709's answer:

You can try this approach:

You should find the XPath of next_page. It may be a link or a button pointing to the next page:

next_page = response.selector.xpath(--xpath expression--).extract_first()

if next_page is not None:
    next_page_link = response.urljoin(next_page)
    yield scrapy.Request(url=next_page_link, callback=self.parse)
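
For example, if the site exposes a standard rel="next" pagination link (an assumption about the markup; substitute whatever expression matches your target pages), the filled-in version could look like this:

# Hypothetical selector: assumes an <a rel="next" href="..."> pagination link.
next_page = response.xpath('//a[@rel="next"]/@href').extract_first()

if next_page is not None:
    next_page_link = response.urljoin(next_page)
    yield scrapy.Request(url=next_page_link, callback=self.parse)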

This is what the parse function would look like:

def parse(self, response):
    print("**********************************")
    print(type(response))
    print(response.url)

    all_section = response.css('a.gnt_m_flm_a')

    for quote in all_section:
        # Create a fresh item per link so earlier results are not overwritten.
        items = MynewsItem()
        news_provider_id = '14'
        news_title = quote.css('a.gnt_m_flm_a').xpath("text()").extract()
        news_details = quote.css('a.gnt_m_flm_a').xpath("@data-c-br").extract()
        news_image = quote.css("img.gnt_m_flm_i").xpath("@data-gl-srcset").extract()
        news_page_url = quote.css('a.gnt_m_flm_a').xpath("@href").extract()

        items['news_provider_id'] = news_provider_id
        items['news_title'] = news_title
        items['news_details'] = news_details
        items['news_image'] = news_image
        items['news_page_url'] = news_page_url
        yield items

    next_page = response.selector.xpath("").extract_first()  # put your next-page XPath here

    if next_page is not None:
        next_page_link = response.urljoin(next_page)
        yield scrapy.Request(url=next_page_link, callback=self.parse)
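
As a side note, on Scrapy 1.4+ the same pagination can be written with response.follow, which resolves relative URLs itself so urljoin is not needed; a minimal sketch, again assuming a hypothetical rel="next" link:

# response.follow accepts a relative URL and builds the absolute Request itself.
next_page = response.xpath('//a[@rel="next"]/@href').extract_first()
if next_page is not None:
    yield response.follow(next_page, callback=self.parse)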