Ⅴ.Re:Scrapy: Engine

ExecutionEngine

The Engine is responsible for the data flow and interaction between all components in the system.

ExecutionEngine is the concrete Engine implementation. It initializes the Scheduler, the Slot, the Downloader (with its Downloader Middlewares), the Scraper (Spider Middlewares plus Item Pipelines), the Stats collector, and so on, and manages and controls all of them.
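As a rough map of that wiring, the constructor pulls the component classes out of the settings and instantiates them. This is a paraphrased sketch, not the verbatim source; details vary between Scrapy versions:

# Abridged sketch of ExecutionEngine.__init__ (paraphrased from Scrapy 2.x;
# treat it as a map of the components, not the actual source)
from scrapy.core.scraper import Scraper
from scrapy.utils.misc import load_object

class ExecutionEngine:
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.slot = None        # per-spider state: start_requests, scheduler, heartbeat
        self.spider = None
        self.running = False
        self.scheduler_cls = load_object(self.settings["SCHEDULER"])
        downloader_cls = load_object(self.settings["DOWNLOADER"])
        self.downloader = downloader_cls(crawler)   # wraps the downloader middlewares
        self.scraper = Scraper(crawler)             # spider middlewares + item pipelines
        self._spider_closed_callback = spider_closed_callback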

Source code walkthrough

scrapy/crawler.py#Crawler crawl(self, *args, **kwargs)

# CrawlerRunner.crawl(self, crawler_or_spidercls, *args, **kwargs) -->
# CrawlerRunner._crawl(self, crawler, *args, **kwargs) -->
# Crawler.crawl(self, *args, **kwargs)

@defer.inlineCallbacks
def crawl(self, *args, **kwargs):
    if self.crawling:
        raise RuntimeError("Crawling already taking place")
    self.crawling = True
    try:
        self.spider = self._create_spider(*args, **kwargs)
        self.engine = self._create_engine()
        start_requests = iter(self.spider.start_requests())
        # open the spider on the engine, then start the engine
        yield self.engine.open_spider(self.spider, start_requests)
        yield defer.maybeDeferred(self.engine.start)
    except Exception:
        self.crawling = False
        if self.engine is not None:
            yield self.engine.close()
        raise
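In practice this chain is kicked off by CrawlerProcess (a CrawlerRunner subclass). A minimal, self-contained run, with DemoSpider as a made-up example spider:

# Minimal end-to-end usage: CrawlerProcess.crawl() walks the chain above
# and eventually invokes Crawler.crawl(). DemoSpider is illustrative only.
from scrapy import Spider
from scrapy.crawler import CrawlerProcess

class DemoSpider(Spider):
    name = "demo"
    start_urls = ["https://example.com"]

    def parse(self, response):
        yield {"title": response.css("title::text").get()}

process = CrawlerProcess()
process.crawl(DemoSpider)   # crawl(crawler_or_spidercls) -> _crawl -> Crawler.crawl
process.start()             # starts the Twisted reactor; blocks until the crawl ends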

scrapy/core/engine.py#ExecutionEngine open_spider(self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True)

# See Ⅲ.Re:Scrapy: Architecture Overview for the details
@inlineCallbacks
def open_spider(self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True):
    if self.slot is not None:
        raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
    logger.info("Spider opened", extra={'spider': spider})
    nextcall = CallLaterOnce(self._next_request)
    scheduler = create_instance(self.scheduler_cls, settings=None, crawler=self.crawler)
    start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
    self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.spider = spider
    if hasattr(scheduler, "open"):
        yield scheduler.open(spider)
    yield self.scraper.open_spider(spider)
    self.crawler.stats.open_spider(spider)
    yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
    self.slot.nextcall.schedule()
    self.slot.heartbeat.start(5)
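Two details worth unpacking here. nextcall wraps the engine's scheduling loop: CallLaterOnce (from scrapy.utils.reactor) coalesces any number of schedule() calls into a single reactor turn, so _next_request never runs more than once per tick no matter how many components poke it. A condensed sketch of the idea (the real class also has a cancel() method):

from twisted.internet import reactor

class CallLaterOnce:
    """Schedule func for the next reactor turn, at most once: repeated
    schedule() calls are no-ops while a call is already pending."""
    def __init__(self, func, *a, **kw):
        self._func, self._a, self._kw = func, a, kw
        self._call = None

    def schedule(self, delay=0):
        if self._call is None:              # nothing pending -> schedule one call
            self._call = reactor.callLater(delay, self)

    def __call__(self):
        self._call = None                   # clear so it can be scheduled again
        return self._func(*self._a, **self._kw)

The heartbeat on the last line is a LoopingCall over nextcall.schedule, so self.slot.heartbeat.start(5) makes even an otherwise idle engine wake up every 5 seconds and re-check for work.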

scrapy/core/engine.py#ExecutionEngine start(self) -> Deferred

# Record the start time and create a Deferred; the crawler shuts down gracefully when it fires
@inlineCallbacks
def start(self) -> Deferred:
    if self.running:
        raise RuntimeError("Engine already running")
    self.start_time = time()
    yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
    self.running = True
    self._closewait = Deferred()
    yield self._closewait
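The trick here is that start() never finishes on its own: yield self._closewait suspends the generator on a Deferred that only stop() fires (see below). A standalone toy demonstrating the pattern; the names are illustrative only:

# Toy demo of the "wait on a Deferred until someone fires it" pattern
# behind start()/_closewait
from twisted.internet import defer, task

@defer.inlineCallbacks
def run(closewait):
    print("engine running")
    yield closewait                     # suspends here until callback() fires
    print("engine stopped")

def main(reactor):
    closewait = defer.Deferred()
    d = run(closewait)
    # simulate stop() firing _closewait after one second
    reactor.callLater(1, closewait.callback, None)
    return d

task.react(main)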

scrapy/core/engine.py#ExecutionEngine close(self) -> Deferred

def close(self) -> Deferred:
    """
    Gracefully close the execution engine.
    If it has already been started, stop it. In all cases, close the spider and the downloader.
    """
    if self.running:
        return self.stop()  # will also close spider and downloader
    if self.spider is not None:
        return self.close_spider(self.spider, reason="shutdown")  # will also close downloader
    return succeed(self.downloader.close())

scrapy/core/engine.py#ExecutionEngine stop(self) -> Deferred

def stop(self) -> Deferred:
    """Gracefully stop the execution engine"""
    @inlineCallbacks
    def _finish_stopping_engine(_) -> Deferred:
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
    if not self.running:
        raise RuntimeError("Engine not running")
    self.running = False
    dfd = self.close_spider(self.spider, reason="shutdown") if self.spider is not None else succeed(None)
    return dfd.addBoth(_finish_stopping_engine)
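Note the addBoth: _finish_stopping_engine runs whether closing the spider succeeded or failed, so engine_stopped is always sent and _closewait always fires, which in turn lets the yield self._closewait in start() return. A tiny illustration of addBoth semantics:

# addBoth attaches the same function as both callback and errback,
# i.e. it runs on success *and* on failure (roughly a "finally" block)
from twisted.internet import defer

def cleanup(result):
    print("cleanup runs no matter what:", result)
    return None   # swallow any Failure so the toy exits cleanly

defer.succeed("ok").addBoth(cleanup)                  # success path
defer.fail(RuntimeError("boom")).addBoth(cleanup)     # failure path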
