I am currently crawling multiple websites at once, and I need to crawl each site's next page by following the next-page link extracted from the crawled page, so that every page's next page is fetched in turn. Note that the second page of each site has the same div content.
Spider.py
import scrapy
from ..items import MynewsItem  # assuming Items.py (below) lives in the project package

class UstodaySpider(scrapy.Spider):
    name = 'usatoday'
    start_urls = [
        'https://en.wikipedia.org/wiki/India',
        'https://en.wikipedia.org/wiki/USA',
    ]

    def parse(self, response):
        print("**********************************")
        print(type(response))
        print(response.url)
        all_section = response.css('a.gnt_m_flm_a')
        for quote in all_section:
            # Build a fresh item for every link; reusing a single instance
            # across yields keeps overwriting the same object.
            items = MynewsItem()
            news_provider_id = '14'
            # quote already matches a.gnt_m_flm_a, so query it directly
            # rather than re-selecting the same class inside it (which
            # searches descendants and finds nothing).
            news_title = quote.xpath('text()').extract()
            news_details = quote.xpath('@data-c-br').extract()
            news_image = quote.css('img.gnt_m_flm_i').xpath('@data-gl-srcset').extract()
            news_page_url = quote.xpath('@href').extract()
            items['news_provider_id'] = news_provider_id
            items['news_title'] = news_title
            items['news_details'] = news_details
            items['news_image'] = news_image
            items['news_page_url'] = news_page_url
            yield items
            # This only builds and prints a string; it never schedules a
            # request, so the next page is not actually crawled.
            next_page = 'https://en.wikipedia.org/wiki/India' + str(news_page_url)
            print(next_page)
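Since every subsequent page carries the same div structure, the standard Scrapy pattern is to extract the next-page link inside parse() and yield a follow-up request back into the same callback, rather than concatenating the URL by hand. A minimal sketch of that pattern; the a.next-page selector is a placeholder, so substitute whatever selector matches the next-page link in the crawled site's markup:

import scrapy

class PaginatedSpider(scrapy.Spider):
    name = 'paginated'
    start_urls = [
        'https://en.wikipedia.org/wiki/India',
        'https://en.wikipedia.org/wiki/USA',
    ]

    def parse(self, response):
        # ... extract and yield the items for the current page here ...

        # 'a.next-page' is a placeholder selector, not the real one.
        next_page = response.css('a.next-page::attr(href)').get()
        if next_page is not None:
            # response.follow resolves relative hrefs against response.url,
            # so no base URL needs to be hard-coded; the same parse()
            # callback then handles the identical div layout on page two.
            yield response.follow(next_page, callback=self.parse)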
Pipeline.py
import mysql.connector  # 'import mysql' alone does not expose mysql.connector

class MynewsPipeline(object):
    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = mysql.connector.connect(
            host='localhost', user='root', password='', database='mydb', port='3306'
        )
        self.curr = self.conn.cursor()

    def create_table(self):
        # Note: this drops and recreates the table on every run.
        self.curr.execute("""DROP TABLE IF EXISTS news_crawl_newsdetails""")
        self.curr.execute("""CREATE TABLE news_crawl_newsdetails(
            news_provider_id text, news_title text, news_details text,
            news_image text, news_page_url text
        )""")

    def process_item(self, item, spider):
        self.store_db(item)
        return item

    def store_db(self, item):
        # Five columns require five %s placeholders (the original had only three).
        self.curr.execute(
            """INSERT INTO news_crawl_newsdetails
               (news_provider_id, news_title, news_details, news_image, news_page_url)
               VALUES (%s, %s, %s, %s, %s)""",
            (item['news_provider_id'], item['news_title'][0], item['news_details'][0],
             item['news_image'][0], item['news_page_url'][0])
        )
        self.conn.commit()
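For the pipeline to receive items at all, it has to be enabled in the project's settings. A minimal sketch, assuming the project package is named mynews and the class sits in mynews/pipelines.py (adjust the dotted path to the actual layout):

# settings.py -- 'mynews' is an assumed package name
ITEM_PIPELINES = {
    'mynews.pipelines.MynewsPipeline': 300,  # lower number = runs earlier
}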
Items.py
import scrapy

class MynewsItem(scrapy.Item):
    news_provider_id = scrapy.Field()
    news_title = scrapy.Field()
    news_details = scrapy.Field()
    news_image = scrapy.Field()
    news_page_url = scrapy.Field()
    news_des = scrapy.Field()