Unable to change the settings of a Scrapy Spider

I am trying to start a Scrapy crawl from a request that carries some parameters:

    msg_req_obj = MessageRequestObject(
        azureServiceBus=self.azure_service_bus,
        sbReqq=self.sb_request_queue,
        sbResQ=self.sb_response_queue,
        session=message_body['session'],
        studyName=message_body['studyName'],
        studyId=message_body['studyId'],
        strategyId=message_body['strategyId'],
        requestId=message_body['requestId'],
        email=message_body['email'],
        crawlDepth=message_body['crawlDepth'],
        crawlPageCount=message_body['crawlPageCount'],
        sites=site_obj_array,
        msg=message
    )

The message essentially carries the first URL used to start the spider, plus two settings that change with every spider that gets created: crawlDepth and crawlPageCount.
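For reference, the relevant part of the incoming message body looks roughly like this (the keys are the ones read in the snippet above; every value is made up for illustration):

    # Hypothetical example of message_body -- only the keys read above are
    # shown, and all values are invented.
    message_body = {
        "session": "a1b2c3",
        "studyName": "example-study",
        "studyId": "42",
        "strategyId": "7",
        "requestId": "req-001",
        "email": "user@example.com",
        "crawlDepth": 3,        # per-request crawl depth for this spider
        "crawlPageCount": 100,  # per-request cap on the number of crawled pages
    }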

I have the following ways of getting settings:

  • The settings.py file, which holds the spider's "default" settings:

    DEPTH_LIMIT = 3
    DEPTH_STATS_VERBOSE = True

  • The config_settings.py file, which I use to add settings across the whole project, including some meant to override settings.py (the method below belongs to a ConfigurationSettings class; a sketch of that class follows this list):

    def depth_limit(self):
        _default_depth_limit = 4
        if (self._depth_limit):
            try:
                return int(self._depth_limit)
            except Exception as ex:
                logger.error('"DEPTH_LIMIT" is not a number in application settings. Using default value "' + str(_default_depth_limit) + '"')
                return _default_depth_limit
        else:
            print('"DEPTH_LIMIT" not found/empty in application settings. Using default value "' + str(_default_depth_limit) + '"')
            return _default_depth_limit
    
  • custom_settings in the spider, which overrides the settings from settings.py with the values coming from the config_settings.py file:

    custom_settings = {
        'ROBOTSTXT_OBEY': configurationSettings.obey_robotstxt,
        'DEPTH_LIMIT': configurationSettings.depth_limit,
        'DOWNLOAD_DELAY': configurationSettings.download_delay_for_requests,
        'CLOSESPIDER_PAGECOUNT': configurationSettings.max_responses_to_crawl
    }
  • Getting the settings via get_project_settings() to retrieve the defaults, updating them with the settings.update() method using the values from msg_req_obj, and then starting the CrawlerRunner with the updated settings:
    def crawl_sites(self, blob_config, site_urls, msg_req_obj):
        print("SPIDER STARTED")
        print(site_urls)
        s = get_project_settings()
        s.update({
            "DEPTH_LIMIT": msg_req_obj.crawlDepth,
            "MAX_RESPONSES_TO_CRAWL": msg_req_obj.crawlPageCount,
        })
        self.runner = CrawlerRunner(s)
        self.runner.crawl(GenericSpider, blobConfig=blob_config, msgReqObj=msg_req_obj, urls=site_urls)

        deferred = self.runner.join()
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
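As mentioned in the second bullet, the depth_limit method belongs to a ConfigurationSettings class that is also used in the spider via ConfigurationSettings.getInstance() and configurationSettings.depth_limit. Below is a minimal sketch of what that class presumably looks like; the singleton accessor and the @property are inferred from those call sites, and reading from os.environ is only a stand-in for whatever "application settings" store is really used:

    # Sketch only: getInstance() and the depth_limit property are inferred from
    # how the class is used elsewhere in this question; os.environ is a
    # placeholder for the real application-settings source.
    import os

    class ConfigurationSettings:
        _instance = None

        def __init__(self):
            # Raw (possibly missing or non-numeric) value from application settings
            self._depth_limit = os.environ.get('DEPTH_LIMIT')

        @classmethod
        def getInstance(cls):
            if cls._instance is None:
                cls._instance = cls()
            return cls._instance

        @property
        def depth_limit(self):
            # Same intent as the method shown in the second bullet above:
            # fall back to 4 when the value is missing or not a number.
            _default_depth_limit = 4
            try:
                return int(self._depth_limit)
            except (TypeError, ValueError):
                return _default_depth_limit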

The last approach effectively changes the settings that get passed to the CrawlerRunner. But the spider does not pick them up and starts with DEPTH_LIMIT = 1 instead.

I have tried hard-coding different values for DEPTH_LIMIT through every other mechanism (settings.py, config_settings.py, custom_settings), but none of them seem to work: the spider always crawls items up to a depth of 1 before stopping and closing. It looks as if the spider ignores all of these settings and "defaults" to DEPTH_LIMIT = 1.

What am I missing? Are there any other steps I should take to make this work?
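One detail that may matter here: Scrapy merges the same setting from several sources with different priorities. settings.update() applies values at 'project' priority by default, while a spider's custom_settings is applied at the higher 'spider' priority, so if DEPTH_LIMIT is also listed in custom_settings (as in the snippet above) that value quietly wins over whatever is pushed into the CrawlerRunner. A minimal diagnostic (not a fix) is to log the value the running crawler actually resolved, assuming self.crawler is stored in __init__ as in the spider code below:

    # Diagnostic sketch only: log the resolved DEPTH_LIMIT and the depth that
    # Scrapy assigned to the response. Could be dropped at the top of parse_item.
    def parse_item(self, response):
        self.logger.info(
            "effective DEPTH_LIMIT=%s, response depth=%s",
            self.crawler.settings.getint("DEPTH_LIMIT"),
            response.meta.get("depth"),
        )
        item = self._get_item(response)
        yield item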

EDIT:

Here is the code of my CrawlProcess class:

class CrawlProcess(object):
    """description of class"""

    def __init__(self,blob_service,blob_service_output_container_name):
        """
        Constructor
        """        
        self.blob_service = blob_service
        self.blob_service_output_container_name = blob_service_output_container_name
        settings_file_path = 'scrapy_app.scrapy_app.settings' # The path seen from root, i.e. from crawlProcess.py
        os.environ.setdefault('SCRAPY_SETTINGS_MODULE',settings_file_path)
        self.runner = ''

    def spider_closing(self,spider):
        """activates on spider closed signal"""
        print("STOPPING SPIDER")
        self.runner.join()

    def crawl_sites(self, blob_config, site_urls, msg_req_obj):
        print("SPIDER STARTED")
        print(site_urls)
        s = get_project_settings()
        s.update({
            "DEPTH_LIMIT": msg_req_obj.crawlDepth,
            "MAX_RESPONSES_TO_CRAWL": msg_req_obj.crawlPageCount,
        })
        self.runner = CrawlerRunner(s)

        self.runner.crawl(GenericSpider, blobConfig=blob_config, msgReqObj=msg_req_obj, urls=site_urls)

        deferred = self.runner.join()
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()

    def start_process(self,msg_req_obj):
        blob_config = BlobConfig(blob_service=self.blob_service, blob_container_name=self.blob_service_output_container_name)

        crawl_sites_process = mp.Process(target=self.crawl_sites, args=(blob_config, msg_req_obj), daemon=True)

        print("STARTING SPIDER")
        crawl_sites_process.start()
        crawl_sites_process.join()
        print("SPIDER STOPPED")
        print("ENGINE STOPPED")   

Here is the code of my GenericSpider:

import scrapy
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse
try:
    from scrapy_app.scrapy_app.items import HtmlItem
except ImportError:
    from scrapy_app.items import HtmlItem

import re
import os
import json
from scrapy_splash.response import SplashJsonResponse
from scrapy.spiders import CrawlSpider,Rule
from scrapy_app.scrapy_app.utils import get_domain_from_url,get_subdomain_from_url
import logging
from config_settings import ConfigurationSettings
from scrapy_splash import SplashRequest

logger = logging.getLogger(__name__)

class GenericSpider(CrawlSpider):
    extractor = LinkExtractor()
    crawl_depth = 0
    name = 'generic'
    configurationSettings = ConfigurationSettings.getInstance()
    handle_httpstatus_list = configurationSettings.handle_http_statuses
    handle_httpstatus_all = configurationSettings.handle_all_http_statuses
    custom_settings = {
        'ROBOTSTXT_OBEY': configurationSettings.obey_robotstxt,
        'CLOSESPIDER_PAGECOUNT': configurationSettings.max_responses_to_crawl
    }


    logger.setLevel(logging.INFO)
    logging.basicConfig(
        filename='scraping.log', format='%(levelname)s: %(message)s', level=logging.INFO
    )

    def __init__(self,crawler,*args,**kwargs):
        self.crawler = crawler
        self.blobConfig = kwargs.get('blobConfig')
        self.msgReqObj = kwargs.get('msgReqObj')
        self.urls = kwargs.get('urls')
        self.allowed_domains = [urlparse(url).netloc for url in self.urls]
        self.start_urls = self.urls
        self.proxy_pool = self.configurationSettings.proxies

        self.suggestedKeywords = self.configurationSettings.suggested_keywords


        self.rules = [
            Rule(
                LinkExtractor(allow=(), allow_domains=self.allowed_domains, canonicalize=True, unique=True),
                follow=True,
                callback="parse_item",
                process_request="use_splash_request"
            ),
        ]

        self._follow_links = True
        self._domain = ""

        self.failed_urls_dict = {}
        for httpstatus in self.handle_httpstatus_list:
            self.failed_urls_dict[httpstatus] = []


        super(GenericSpider,self).__init__(crawler,**kwargs)


    @classmethod
    def from_crawler(cls, crawler, **kwargs):
        # settings = crawler.settings
        return cls(crawler, **kwargs)

    def parse_item(self,response):

        # if self.handle_httpstatus_all or response.status not in self.handle_httpstatus_list:              # Without this line, ALL HTTP responses are handled
        item = self._get_item(response)
        yield item

    def _get_item(self, response):

        children = []
        # Get parameters from the Response
        _domain = response.meta['url_domain'] if 'url_domain' in response.meta else get_domain_from_url(response.url)
        _subdomain = get_subdomain_from_url(response.url,_domain)
        _comparableId = response.meta['comparableId'] if 'comparableId' in response.meta else 'NA'
        root = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(response.url))
        _html = response.text
        base_tag = response.css("head base").extract()
        if not base_tag:
            _html = _html.replace("</head>","<base href=\"" + root + "/\"></head>")

        #Populate Child pages List
        links = self.extractor.extract_links(response)
        [children.append(link.url) for link in links]


        item = HtmlItem(
            url=response.url,
            domain=_domain,
            subdomain=_subdomain,
            html=_html,
            description='',
            title='',
            is_suggested=str(False),
            comparable_id=str(_comparableId),
            is_error=str(False) if 200 <= response.status < 300 else str(True),
            http_status=response.status,
            crawl_depth=response.meta['depth'],
            child_pages=children
        )

        self._set_title(item,response)
        self._set_description(item,response)
        self._is_suggested(item)

        return item

    def _set_title(self, item, response):
        if isinstance(response, SplashJsonResponse) or response.meta['isFirstPage'] == True:
            title = response.css("title::text").extract()
            if title:
                item['title'] = title[0].encode("utf-8")
        else: 
            pass

    def _set_description(self, item, response):
        if isinstance(response, SplashJsonResponse):
            meta_description = response.css("meta[name=\"description\"]::attr(content)").extract()
            if meta_description:
                item['description'] = meta_description[0].encode("utf-8")

    def _is_suggested(self,item):
        #logger.info('TITLE-DESCRIPTION:- %(title)s ==> %(desc)s',{'title': item['title'],'desc': item['description']})
        _title = item['title'].decode("utf-8") if item['title'] else ''
        _description = item['description'].decode("utf-8") if item['description'] else ''
        try :
            if any(re.search(r'\b' + sug_kwd + r'\b', _title, re.IGNORECASE) for sug_kwd in self.suggestedKeywords) \
              or any(re.search(r'\b' + sug_kwd + r'\b', _description, re.IGNORECASE) for sug_kwd in self.suggestedKeywords):
                item['is_suggested'] = str(True)
        except Exception as ex:
            template = "GenericSpider:- An exception of type {0} occurred. Arguments:\n{1!r}"
            ex_message = template.format(type(ex).__name__,ex.args)
            print(ex_message)
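For comparison, the pattern shown in the Scrapy documentation for overriding from_crawler delegates to super().from_crawler(), which is the step that binds the crawler (and crawler.settings) to the spider instance. The sketch below only illustrates that shape and is not claimed to be the fix for the depth issue; note that the stock implementation instantiates the spider as cls(*args, **kwargs), so crawler would no longer be passed positionally to __init__ and is attached afterwards as spider.crawler / spider.settings.

    # Reference sketch of the documented from_crawler pattern: let Scrapy build
    # the spider and perform its own crawler/settings binding, then customise
    # the instance. Shown for comparison with the override above.
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(GenericSpider, cls).from_crawler(crawler, *args, **kwargs)
        # crawler.settings is now exposed as spider.settings
        spider.logger.info("DEPTH_LIMIT seen by spider: %s",
                           spider.settings.getint("DEPTH_LIMIT"))
        return spider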
