Ⅴ.Re:Scrapy: Engine
ExecutionEngine
The Engine is responsible for the flow of data and the interaction between all components in the system. The engine object is an instance of ExecutionEngine; it initializes the Scheduler, Slot, Downloader (with its Downloader Middlewares), Scraper (with its Spider Middlewares and Item Pipelines), Stats, and so on, and manages and controls them all.
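Before reading the source, it helps to see the entry point that sets this machinery in motion. The sketch below (the spider name and URL are placeholders) drives a crawl through CrawlerProcess; its crawl() call ultimately reaches the Crawler.crawl() method dissected next.

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for text in response.css("div.quote span.text::text").getall():
            yield {"text": text}

# CrawlerProcess.crawl() resolves the spider class into a Crawler and
# eventually invokes Crawler.crawl(), which builds the ExecutionEngine.
process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(QuotesSpider)
process.start()  # blocks until all crawls finish, then stops the reactor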
Source Code Analysis
scrapy/crawler.py#Crawler crawl(self, *args, **kwargs)
# Call chain: CrawlerRunner.crawl(self, crawler_or_spidercls, *args, **kwargs) -->
# CrawlerRunner._crawl(self, crawler, *args, **kwargs) -->
# Crawler.crawl(self, *args, **kwargs)
@defer.inlineCallbacks
def crawl(self, *args, **kwargs):
    if self.crawling:
        raise RuntimeError("Crawling already taking place")
    self.crawling = True
    try:
        self.spider = self._create_spider(*args, **kwargs)
        self.engine = self._create_engine()
        start_requests = iter(self.spider.start_requests())
        # open the spider on the engine, then start the engine
        yield self.engine.open_spider(self.spider, start_requests)
        yield defer.maybeDeferred(self.engine.start)
    except Exception:
        self.crawling = False
        if self.engine is not None:
            yield self.engine.close()
        raise
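Two Twisted idioms carry this method: @defer.inlineCallbacks turns crawl() into a generator that suspends at every yield until the yielded Deferred fires, and defer.maybeDeferred() lets it call engine.start() without caring whether the callee is synchronous or asynchronous. A minimal, self-contained illustration (demo() is a made-up name):

from twisted.internet import defer

@defer.inlineCallbacks
def demo():
    # Like Crawler.crawl(), this generator pauses at each yield until
    # the Deferred fires, then resumes with the Deferred's result.
    value = yield defer.maybeDeferred(lambda: 42)
    # maybeDeferred wraps a plain return value (or raised exception)
    # in an already-fired Deferred, so sync and async callables are
    # handled uniformly; crawl() uses it the same way on engine.start().
    return value + 1

demo().addCallback(print)  # prints 43; everything here fires synchronously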
scrapy/core/engine.py#ExecutionEngine open_spider(self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True)
# See Ⅲ.Re:Scrapy: Architecture Overview for the full picture
@inlineCallbacks
def open_spider(self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True):
    if self.slot is not None:
        raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
    logger.info("Spider opened", extra={'spider': spider})
    nextcall = CallLaterOnce(self._next_request)
    scheduler = create_instance(self.scheduler_cls, settings=None, crawler=self.crawler)
    start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
    self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.spider = spider
    if hasattr(scheduler, "open"):
        yield scheduler.open(spider)
    yield self.scraper.open_spider(spider)
    self.crawler.stats.open_spider(spider)
    yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
    self.slot.nextcall.schedule()
    self.slot.heartbeat.start(5)
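The line nextcall = CallLaterOnce(self._next_request) creates the engine's pump: many code paths call slot.nextcall.schedule(), yet the wrapped function runs at most once per scheduling round. Below is a simplified sketch of the idea behind scrapy.utils.reactor.CallLaterOnce, not a verbatim copy of it:

from twisted.internet import reactor

class CallLaterOnce:
    """Collapse many schedule() calls into a single reactor callback.

    However many times schedule() is invoked before the reactor tick,
    func runs only once; after it runs, it can be scheduled again.
    (Simplified sketch of the mechanism used by the engine's nextcall.)
    """
    def __init__(self, func, *args, **kwargs):
        self._func = func
        self._args = args
        self._kwargs = kwargs
        self._call = None  # pending IDelayedCall, if any

    def schedule(self, delay=0):
        if self._call is None:  # ignore duplicate wake-up requests
            self._call = reactor.callLater(delay, self)

    def __call__(self):
        self._call = None  # allow re-scheduling from inside func
        return self._func(*self._args, **self._kwargs)

The heartbeat started on the last line of open_spider is a LoopingCall that re-schedules nextcall every 5 seconds, so the engine wakes up periodically even when no download result or signal triggers it.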
scrapy/core/engine.py#ExecutionEngine start(self) -> Deferred
# Records the start time and creates a Deferred(); when that Deferred
# fires, the crawler shuts down gracefully
@inlineCallbacks
def start(self) -> Deferred:
    if self.running:
        raise RuntimeError("Engine already running")
    self.start_time = time()
    yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
    self.running = True
    self._closewait = Deferred()
    yield self._closewait
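start() does very little itself; the real trick is the final yield self._closewait. Because _closewait is an unfired Deferred, the inlineCallbacks generator stays suspended on that line for the crawl's entire lifetime, until stop() eventually calls self._closewait.callback(None). A minimal illustration of this latch pattern (run_until_stopped() is a made-up name; the callLater stands in for an external stop()):

from twisted.internet import defer, reactor

@defer.inlineCallbacks
def run_until_stopped():
    # Same pattern as ExecutionEngine.start(): yield an unfired
    # Deferred and stay suspended here until someone fires it.
    closewait = defer.Deferred()
    reactor.callLater(1, closewait.callback, "shutdown")  # stand-in for stop()
    reason = yield closewait
    print("engine finished:", reason)
    reactor.stop()

run_until_stopped()
reactor.run()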
scrapy/core/engine.py#ExecutionEngine close(self) -> Deferred
def close(self) -> Deferred:
    """
    Gracefully close the execution engine.
    If it has already been started, stop it. In all cases, close the spider and the downloader.
    """
    if self.running:
        return self.stop()  # will also close spider and downloader
    if self.spider is not None:
        return self.close_spider(self.spider, reason="shutdown")  # will also close downloader
    return succeed(self.downloader.close())
scrapy/core/engine.py#ExecutionEngine stop(self) -> Deferred
def stop(self) -> Deferred:
    """Gracefully stop the execution engine"""
    @inlineCallbacks
    def _finish_stopping_engine(_) -> Deferred:
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
    if not self.running:
        raise RuntimeError("Engine not running")
    self.running = False
    dfd = self.close_spider(self.spider, reason="shutdown") if self.spider is not None else succeed(None)
    return dfd.addBoth(_finish_stopping_engine)
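The addBoth() on the last line is what makes the stop graceful: _finish_stopping_engine is attached as both callback and errback, so the engine_stopped signal is sent and _closewait is fired (releasing the start() generator above) whether close_spider() succeeded or failed. A small demonstration of that semantics (cleanup() is a made-up name):

from twisted.internet import defer

def cleanup(result):
    # addBoth installs the same function as callback *and* errback,
    # so it runs like a `finally` clause: on success `result` is the
    # value, on failure it is a twisted.python.failure.Failure.
    print("cleanup ran with:", result)
    return result

defer.succeed("done").addBoth(cleanup)                 # cleanup sees "done"
d = defer.fail(RuntimeError("boom")).addBoth(cleanup)  # cleanup sees a Failure
d.addErrback(lambda failure: None)  # swallow the Failure so Twisted does not log it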