我可以使用CrawlerRunner或CrawlerProcess从另一个Python脚本调用一个抓爬虫。但是,当我尝试从pywikibot机器人调用同一蜘蛛调用类时,出现ReactorNotRestartable错误。为什么会这样,我该如何解决?
这是错误:
File ".\scripts\userscripts\ReplicationWiki\RWLoad.py",line 161,in format_new_page
aea = AEAMetadata(url=DOI_url)
File ".\scripts\userscripts\ReplicationWiki\GetaEAMetadata.py",line 39,in __init__
reactor.run() # the script will block here until all crawling jobs are finished
File "C:\Users\lextr\.conda\envs\py37\lib\site-packages\twisted\internet\base.py",line 1282,in run
self.startRunning(installSignalHandlers=installSignalHandlers)
File "C:\Users\lextr\.conda\envs\py37\lib\site-packages\twisted\internet\base.py",line 1262,in startRunning
ReactorBase.startRunning(self)
File "C:\Users\lextr\.conda\envs\py37\lib\site-packages\twisted\internet\base.py",line 765,in startRunning
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
CRITICAL: Exiting due to uncaught exception <class 'twisted.internet.error.ReactorNotRestartable'>
这是一个叫我沙皮蜘蛛的脚本。如果我只是从main调用类,它将运行正常。
from twisted.internet import reactor,defer
from scrapy import signals
from scrapy.crawler import Crawler,CrawlerProcess,CrawlerRunner
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
from Scrapers.spiders.ScrapeAEA import ScrapeaeaSpider
class AEAMetadata:
"""
Helper to run ScrapeAEA spider and return JEL codes and data links
for a given AEA article link.
"""
def __init__(self,*args,**kwargs):
"""Initializer"""
url = kwargs.get('url')
if not url:
raise ValueError('No article url given')
self.items = []
def collect_items(item,response,spider):
self.items.append(item)
settings = get_project_settings()
crawler = Crawler(ScrapeaeaSpider,settings)
crawler.signals.connect(collect_items,signals.item_scraped)
runner = CrawlerRunner(settings)
d = runner.crawl(crawler,url=url)
d.addBoth(lambda _: reactor.stop())
reactor.run() # the script will block here until all crawling jobs are finished
#process = CrawlerProcess(settings)
#process.crawl(crawler,url=url)
#process.start() # the script will block here until the crawling is finished
def get_jelcodes(self):
jelcodes = self.items[0]['jelcodes']
return jelcodes
def main():
aea = AEAMetadata(url='https://doi.org/10.1257/app.20180286')
jelcodes = aea.get_jelcodes()
print(jelcodes)
if __name__ == '__main__':
main()
更新了简单的Test,该实例两次实例化AEAMetadata类。 这是我的pywikibot机器人中失败的调用代码:
from GetaEAMetadata import AEAMetadata
def main(*args):
for _ in [1,2]:
print('Top')
url = 'https://doi.org/10.1257/app.20170442'
aea = AEAMetadata(url=url)
print('After AEAMetadata')
jelcodes = aea.get_jelcodes()
print(jelcodes)
if __name__ == '__main__':
main()