Ⅱ.Re: Scrapy Startup and Debugging
Startup
Command-line startup
# Run a self-contained spider file; no project required
# scrapy runspider spider_demo.py
scrapy runspider <spider_file.py>
# Run a spider inside a project, looked up by its name attribute
# scrapy crawl spider_demo
scrapy crawl <spider>
Starting from a script
The CrawlerProcess approach
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class MySpider1(scrapy.Spider):
    # spider logic (name, start_urls, parse, ...) omitted
    ...

if __name__ == '__main__':
    settings = get_project_settings()   # load the project settings
    process = CrawlerProcess(settings)  # create the process with those settings
    process.crawl(MySpider1)            # schedule the spider
    # process.crawl(MySpider2)          # run multiple spiders in the same process
    process.start()                     # blocks here until the last crawl callback finishes
The CrawlerRunner approach
# See https://docs.scrapy.org/en/latest/topics/practices.html?highlight=CrawlerProcess#running-multiple-spiders-in-the-same-process
import scrapy
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

class MySpider1(scrapy.Spider):
    # Your first spider definition
    ...

class MySpider2(scrapy.Spider):
    # Your second spider definition
    ...

configure_logging()
settings = get_project_settings()
runner = CrawlerRunner(settings)

@defer.inlineCallbacks
def crawl():
    yield runner.crawl(MySpider1)   # run the first spider to completion
    yield runner.crawl(MySpider2)   # then run the second one
    reactor.stop()                  # stop the reactor once both have finished

crawl()
reactor.run()  # the script will block here until the last crawl call is finished
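The practical difference: CrawlerProcess starts and stops the Twisted reactor for you, which suits standalone scripts, while CrawlerRunner leaves reactor management to your own code, which is the better fit when the application already runs a reactor or needs to chain crawls with other Twisted logic.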
Debugging
scrapy shell
Quick, interactive debugging of responses and selectors.
IDE
Set breakpoints, then start the program from a script and run it under the debugger (see the sketch below).
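A minimal launcher sketch for IDE debugging; the file name run.py and the spider name spider_demo are placeholders. Put the script in the project root (next to scrapy.cfg), set breakpoints in the spider code, and run this file under the debugger.
# run.py — hypothetical launcher in the project root
from scrapy.cmdline import execute

if __name__ == '__main__':
    # Equivalent to typing `scrapy crawl spider_demo` on the command line,
    # but the IDE debugger owns the process, so breakpoints are hit.
    execute(['scrapy', 'crawl', 'spider_demo'])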
Execution flow
# scrapy/crawler.py#CrawlerProcess
def start(self, stop_after_crawl=True, install_signal_handlers=True):
    # Importing the reactor here rather than at module top level avoids
    # installing the default reactor prematurely.
    from twisted.internet import reactor
    # The reactor is Twisted's singleton event manager: it registers and
    # unregisters events, runs the event loop, and invokes the matching
    # callback when an event fires.
    if stop_after_crawl:
        d = self.join()
        # Don't start the reactor if the deferreds are already fired
        if d.called:
            return
        # stop_after_crawl: stop the reactor once every crawler has finished
        d.addBoth(self._stop_reactor)
    if install_signal_handlers:
        install_shutdown_handlers(self._signal_shutdown)
    resolver_class = load_object(self.settings["DNS_RESOLVER"])
    resolver = create_instance(resolver_class, self.settings, self, reactor=reactor)
    resolver.install_on_reactor()
    tp = reactor.getThreadPool()
    tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
    reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
    # Run the event loop
    reactor.run(installSignalHandlers=False)  # blocking call
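start() reads DNS_RESOLVER and REACTOR_THREADPOOL_MAXSIZE from the settings object, so both can be tuned per run. A minimal sketch, assuming the default resolver is kept and only the thread pool size is changed; the value 20 is an arbitrary example:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
# Read back inside start() via self.settings.getint('REACTOR_THREADPOOL_MAXSIZE')
settings.set('REACTOR_THREADPOOL_MAXSIZE', 20)  # arbitrary example value

process = CrawlerProcess(settings)
# process.crawl(SomeSpider)  # hypothetical spider class
process.start()              # runs the start() shown above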
Twisted
- Twisted's reactor is started only by calling reactor.run()
- The reactor loop runs in the process (more precisely, the thread) where it was started, normally the main thread
- Once started, it keeps running; from that point the program (specifically, the thread that started it) is under the reactor's control
- While waiting for events, the reactor loop consumes no CPU
- The reactor never has to be created explicitly; importing it is enough (see the sketch below)
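A minimal sketch (plain Twisted, no Scrapy) illustrating these points: the reactor exists as soon as it is imported, callbacks are registered before the loop starts, and reactor.run() blocks the calling thread until reactor.stop() is invoked.
from twisted.internet import reactor  # importing is enough; no explicit construction

def tick(n):
    print(f'tick {n}')
    if n == 3:
        reactor.stop()                      # ends the event loop, unblocking reactor.run()
    else:
        reactor.callLater(1, tick, n + 1)   # schedule the next callback in 1 second

reactor.callLater(1, tick, 1)  # register the first event before starting the loop
reactor.run()                  # blocks here; callbacks run in this same thread
print('reactor stopped')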