scrapy框架是爬蟲界最為強大的框架，沒有之一。它的強大在於高可擴展性和低耦合，使使用者能夠輕鬆地實現更改和補充。
其中常用的有四種爬蟲主程序模板：scrapy.Spider、CrawlSpider（scrapy內置）以及RedisSpider、RedisCrawlSpider（由scrapy-redis提供），分別為一般爬蟲、深度爬蟲、分布式爬蟲、深度分布式爬蟲提供內部邏輯；下面將從源碼和應用來學習。
scrapy.Spider
源碼:
"""
Base class for Scrapy spiders
See documentation in docs/topics/spiders.rst
"""
import logging
import warnings
from scrapy import signals
from scrapy.http import Request
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import url_is_from_spider
from scrapy.utils.deprecate import create_deprecated_class
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.deprecate import method_is_overridden
class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this
    class.

    Class attributes:
        name: identifier of the spider; must be set on the class or passed
            to ``__init__``.
        custom_settings: optional dict applied to the settings at
            ``'spider'`` priority by ``update_settings``.
    """

    name = None
    custom_settings = None

    def __init__(self, name=None, **kwargs):
        """Set the spider name and copy ``kwargs`` onto the instance.

        Raises:
            ValueError: if no name is passed and the class defines none.
        """
        if name is not None:
            self.name = name
        elif not getattr(self, 'name', None):
            raise ValueError("%s must have a name" % type(self).__name__)
        # Arbitrary keyword arguments become instance attributes.
        self.__dict__.update(kwargs)
        if not hasattr(self, 'start_urls'):
            self.start_urls = []

    @property
    def logger(self):
        """Logger named after the spider, wrapped so that the adapter's
        extra context carries the spider instance.
        """
        logger = logging.getLogger(self.name)
        return logging.LoggerAdapter(logger, {'spider': self})

    def log(self, message, level=logging.DEBUG, **kw):
        """Log the given message at the given log level

        This helper wraps a log call to the logger within the spider, but you
        can use it directly (e.g. Spider.logger.info('msg')) or use any other
        Python logger too.
        """
        self.logger.log(level, message, **kw)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Instantiate the spider with ``args``/``kwargs`` and bind it to
        ``crawler`` through ``_set_crawler``.
        """
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def set_crawler(self, crawler):
        """Deprecated: bind ``crawler`` to an already-instantiated spider.

        Emits a ``ScrapyDeprecationWarning``; use ``from_crawler`` instead.

        Raises:
            AssertionError: if the spider already has a ``crawler`` attribute.
        """
        warnings.warn("set_crawler is deprecated, instantiate and bound the "
                      "spider to this crawler with from_crawler method "
                      "instead.",
                      category=ScrapyDeprecationWarning, stacklevel=2)
        # Explicit raise instead of ``assert`` so the guard is not stripped
        # when Python runs with -O; exception type and message are unchanged.
        if hasattr(self, 'crawler'):
            raise AssertionError("Spider already bounded to a crawler")
        self._set_crawler(crawler)

    def _set_crawler(self, crawler):
        """Store the crawler and its settings, and connect ``self.close`` to
        the ``spider_closed`` signal.
        """
        self.crawler = crawler
        self.settings = crawler.settings
        crawler.signals.connect(self.close, signals.spider_closed)

    def start_requests(self):
        """Yield the initial requests, one per entry of ``start_urls``.

        If a subclass overrides the deprecated ``make_requests_from_url``,
        a warning is issued and that method builds each request; otherwise
        each URL becomes ``Request(url, dont_filter=True)``.
        """
        cls = self.__class__
        if method_is_overridden(cls, Spider, 'make_requests_from_url'):
            warnings.warn(
                "Spider.make_requests_from_url method is deprecated; it "
                "won't be called in future Scrapy releases. Please "
                "override Spider.start_requests method instead (see %s.%s)." % (
                    cls.__module__, cls.__name__
                ),
            )
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
        else:
            for url in self.start_urls:
                yield Request(url, dont_filter=True)

    def make_requests_from_url(self, url):
        """ This method is deprecated. """
        return Request(url, dont_filter=True)

    def parse(self, response):
        """Default callback; subclasses must override it.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError('{}.parse callback is not defined'.format(self.__class__.__name__))

    @classmethod
    def update_settings(cls, settings):
        """Apply ``custom_settings`` (or an empty dict) to ``settings`` at
        ``'spider'`` priority.
        """
        settings.setdict(cls.custom_settings or {}, priority='spider')

    @classmethod
    def handles_request(cls, request):
        """Return the result of ``url_is_from_spider`` for the request URL
        and this spider class.
        """
        return url_is_from_spider(request.url, cls)

    @staticmethod
    def close(spider, reason):
        """Call ``spider.closed(reason)`` when the spider defines a callable
        ``closed`` attribute and return its result; otherwise return None.
        """
        closed = getattr(spider, 'closed', None)
        if callable(closed):
            return closed(reason)

    def __str__(self):
        return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))

    __repr__ = __str__
# Legacy name for Spider, produced by create_deprecated_class (its exact
# behaviour is defined in scrapy.utils.deprecate, not in this file).
BaseSpider = create_deprecated_class('BaseSpider', Spider)
class ObsoleteClass(object):
    """Placeholder object whose missing-attribute lookups always fail.

    Any attribute that is not found through normal lookup raises
    ``AttributeError`` carrying the message given at construction time.
    """

    def __init__(self, message):
        # Stored as a regular instance attribute, so ``.message`` itself
        # stays readable without going through ``__getattr__``.
        self.message = message

    def __getattr__(self, name):
        # Only reached when normal attribute lookup has already failed.
        error = AttributeError(self.message)
        raise error
# Stand-in for the removed ``spiders`` object: any attribute access on it
# raises AttributeError with the migration hint below. The stray trailing
# double quote of the original message has been dropped so the quoting in
# the hint is balanced.
spiders = ObsoleteClass(
    '"from scrapy.spider import spiders" no longer works - use '
    '"from scrapy.spiderloader import SpiderLoader" and instantiate '
    'it with your project settings'
)
# Top-level imports
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.spiders.feed import XMLFeedSpider, CSVFeedSpider
from scrapy.spiders.sitemap import SitemapSpider
其中需要關注的是name(爬蟲名字)、start_urls(抓取的起始url列表)、allowed_domains(限定抓取的url所在域名)、start_requests(開始抓取的方法)。
name、start_urls、allowed_domains是屬性，在創建項目的時候已經建好了，稍作修改即可。start_requests是起始的抓取方法，默認是遍歷start_urls列表生成Request對象；在scrapy中需要登錄的時候可以復寫該方法，這個比較簡單，不再贅述。
CrawlSpider
深度爬蟲，根據連接提取規則，會自動抓取頁面中滿足規則的連接，然後再請求、解析，再抓取，從而一直深入。
源碼
"""
This modules implements the CrawlSpider which is the recommended spider to use
for scraping typical web sites that requires crawling pages.
See documentation in docs/topics/spiders.rst
"""
import copy
import six
from scrapy.http import Request, HtmlResponse
from scrapy.utils.spider import iterate_spider_output
from scrapy.spiders import Spider
def identity(x):
    """Return ``x`` unchanged (used as the default ``process_request`` hook of ``Rule``)."""
    return x
class Rule(object):
    """One link-following rule of a ``CrawlSpider``.

    Args:
        link_extractor: object whose ``extract_links(response)`` yields the
            links this rule applies to.
        callback: callable or method name invoked for each downloaded link.
        cb_kwargs: extra keyword arguments for ``callback``; an empty dict
            is used when omitted or falsy.
        follow: whether links found in responses of this rule are followed.
            When None, it is True exactly when no callback is given.
        process_links: optional callable or method name applied to the
            extracted link list.
        process_request: callable or method name applied to each built
            request; defaults to ``identity``.
    """

    def __init__(self, link_extractor, callback=None, cb_kwargs=None,
                 follow=None, process_links=None, process_request=identity):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request
        # Without an explicit choice, follow links only when there is no
        # callback to handle the response.
        self.follow = (not callback) if follow is None else follow
class CrawlSpider(Spider):
    """Spider that extracts and follows links according to ``rules``.

    ``parse`` is used internally to drive the rule machinery, so subclasses
    should use ``parse_start_url`` and rule callbacks instead of overriding it.
    """

    rules = ()

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    def parse(self, response):
        """Callback for start URLs: run ``parse_start_url`` and follow links."""
        start_callback = self.parse_start_url
        return self._parse_response(response, start_callback, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        """Overridable hook for start-URL responses; returns an empty list."""
        return []

    def process_results(self, response, results):
        """Overridable hook applied to callback output; returns it unchanged."""
        return results

    def _build_request(self, rule, link):
        """Build the request for ``link``.

        ``rule`` is stored under ``meta['rule']``; ``_response_downloaded``
        later uses that value as an index into ``self._rules``.
        """
        request = Request(url=link.url, callback=self._response_downloaded)
        request.meta.update(rule=rule, link_text=link.text)
        return request

    def _requests_to_follow(self, response):
        """Yield a request for every not-yet-seen link extracted by the rules.

        Yields nothing when ``response`` is not an ``HtmlResponse``.
        """
        if not isinstance(response, HtmlResponse):
            return
        already_seen = set()
        for index, rule in enumerate(self._rules):
            extracted = rule.link_extractor.extract_links(response)
            # Earlier rules win: a link claimed by one rule is skipped later.
            candidates = [link for link in extracted if link not in already_seen]
            if candidates and rule.process_links:
                candidates = rule.process_links(candidates)
            for link in candidates:
                already_seen.add(link)
                request = self._build_request(index, link)
                yield rule.process_request(request)

    def _response_downloaded(self, response):
        """Dispatch a followed response to the rule that produced its request."""
        rule_index = response.meta['rule']
        rule = self._rules[rule_index]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        """Yield the callback's output, then (optionally) follow-up requests.

        Links are followed only when both ``follow`` and
        ``self._follow_links`` are truthy.
        """
        if callback:
            output = callback(response, **cb_kwargs) or ()
            output = self.process_results(response, output)
            for entry in iterate_spider_output(output):
                yield entry

        if follow and self._follow_links:
            for entry in self._requests_to_follow(response):
                yield entry

    def _compile_rules(self):
        """Copy ``rules`` into ``self._rules`` and resolve their hooks.

        Hooks given as strings are looked up as attributes of the spider
        (None when missing); callables are kept as they are.
        """
        def resolve(hook):
            if callable(hook):
                return hook
            if isinstance(hook, six.string_types):
                return getattr(self, hook, None)
            return None

        self._rules = [copy.copy(original) for original in self.rules]
        for rule in self._rules:
            for attr in ('callback', 'process_links', 'process_request'):
                setattr(rule, attr, resolve(getattr(rule, attr)))

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and read ``CRAWLSPIDER_FOLLOW_LINKS`` (default True)."""
        spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider

    def set_crawler(self, crawler):
        """Bind ``crawler`` via the parent and read ``CRAWLSPIDER_FOLLOW_LINKS``."""
        super(CrawlSpider, self).set_crawler(crawler)
        self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
CrawlSpider繼承自Spider，也實現了其中的常用屬性和方法，並新增了一個rules屬性(連接提取規則集合)。不同的是，CrawlSpider內部已經實現了parse方法，因此不能在CrawlSpider的子類中把parse用作自己的回調函數名。
def parse(self, response):
    """Handle a start-URL response.

    Delegates to ``_parse_response`` with ``parse_start_url`` as the
    callback, empty callback kwargs and link following enabled.
    """
    start_callback = self.parse_start_url
    return self._parse_response(response, start_callback, cb_kwargs={}, follow=True)
也提供了一個可復寫(overrideable)的方法：
parse_start_url(response)
- 當start_urls中的請求返回時，該方法被調用。該方法分析最初的返回值，並必須返回一個Item對象、一個Request對象，或者一個包含二者的可迭代對象。
rules
在rules中包含一個或多個Rule對象，每個Rule對爬取網站的動作定義了特定操作。如果多個rule匹配了相同的鏈接，則根據規則在本集合中被定義的順序，第一個會被使用。
class scrapy.spiders.Rule(
link_extractor,
callback = None,
cb_kwargs = None,
follow = None,
process_links = None,
process_request = identity
)
- link_extractor：是一個Link Extractor對象，用於定義需要提取的鏈接(Link Extractor對象見下)。
- callback：從link_extractor中每獲取到鏈接時，參數所指定的值作為回調函數，該回調函數接受一個response作為其第一個參數。
  注意：當編寫爬蟲規則時，避免使用parse作為回調函數。由於CrawlSpider使用parse方法來實現其邏輯，如果覆蓋了parse方法，crawl spider將會運行失敗。
- follow：是一個布爾(boolean)值，指定了根據該規則從response提取的鏈接是否需要跟進。如果callback為None，follow默認設置為True，否則默認為False。
- process_links：指定該spider中哪個函數將會被調用，從link_extractor中獲取到鏈接列表時將會調用該函數。該方法主要用來過濾鏈接。
- process_request：指定該spider中哪個函數將會被調用，該規則提取到每個request時都會調用該函數。(用來過濾request)
LinkExtractors
class scrapy.linkextractors.LinkExtractor
Link Extractors 的目的很簡單：提取鏈接。
每個LinkExtractor唯一的公共方法是 extract_links()，它接收一個 Response 對象，並返回一個由 scrapy.link.Link 對象組成的列表。
Link Extractors只需要實例化一次，而 extract_links 方法會針對不同的 response 被多次調用來提取鏈接。
class scrapy.linkextractors.LinkExtractor(
allow = (),
deny = (),
allow_domains = (),
deny_domains = (),
deny_extensions = None,
restrict_xpaths = (),
tags = ('a','area'),
attrs = ('href',),
canonicalize = True,
unique = True,
process_value = None
)
主要參數：
- allow：滿足括號中“正則表達式”的URL會被提取，如果為空，則全部匹配。
- deny：與這個正則表達式(或正則表達式列表)匹配的URL一定不提取(優先級高於allow)。
- allow_domains：會被提取的鏈接的domains。
- deny_domains：一定不會被提取鏈接的domains。
- restrict_xpaths：使用xpath表達式，和allow共同作用過濾鏈接。
案例
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class TestSpider(CrawlSpider):
    """Example CrawlSpider: follows links matching ``Items/`` and passes each
    downloaded page to ``parse_test``.
    """

    name = 'Test'
    allowed_domains = ['Test.com']
    start_urls = ['http://Test.com/']

    rules = (
        # follow=True: keep extracting links from the matched pages as well.
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_test', follow=True),
    )

    def parse_test(self, response):
        """Callback for pages matched by the rule; returns the scraped fields."""
        items = {}
        # ... populate ``items`` from ``response`` here ...
        return items
RedisSpider、RedisCrawlSpider
Scrapy-redis提供了下面四種組件：
Scheduler(調度程序)
Duplication Filter(過濾)
Item Pipeline(數據管道)
Base Spider(爬蟲基類)
Scheduler：
Scrapy中跟“待爬隊列”直接相關的就是調度器Scheduler，它負責對新的request進行入列操作(加入Scrapy queue)、取出下一個要爬取的request(從Scrapy queue中取出)等操作。它把待爬隊列按照優先級建立了一個字典結構，比如：
{
優先級0 : 隊列0
優先級1 : 隊列1
優先級2 : 隊列2
}
然後根據request中的優先級，來決定該入哪個隊列，出列時則按優先級較小的優先出列。為了管理這個比較高級的隊列字典，Scheduler需要提供一系列的方法。但是在分布式爬取中，原來的Scheduler已經無法使用，所以改用Scrapy-redis的scheduler組件。
Duplication Filter
Scrapy中用集合實現request去重功能：Scrapy把已經發送的request指紋放入到一個集合中，把下一個request的指紋拿到集合中比對，如果該指紋存在於集合中，說明這個request發送過了，如果沒有則繼續操作。這個核心的判重功能是這樣實現的：
def request_seen(self, request):
    """Report whether ``request`` was seen before, recording it if not.

    Returns:
        True when the request's fingerprint is already in
        ``self.fingerprints``; False otherwise, after adding the fingerprint
        to that set and, when ``self.file`` is truthy, appending it to the
        file as one line.
    """
    # self.fingerprints is the set of fingerprints seen so far.
    fp = self.request_fingerprint(request)
    # Core of the duplicate check: plain set membership.
    if fp in self.fingerprints:
        return True
    self.fingerprints.add(fp)
    if self.file:
        # Write "\n" rather than os.linesep: a text-mode file already
        # translates "\n" to the platform line ending, so os.linesep would
        # produce "\r\r\n" on Windows (assumes self.file is opened in text
        # mode - confirm against where the file is opened).
        self.file.write(fp + "\n")
    return False
在scrapy-redis中去重是由Duplication Filter組件來實現的，它通過redis的set不重複的特性，巧妙地實現了Duplication Filter去重。scrapy-redis調度器從引擎接受request，將request的指紋存入redis的set檢查是否重複，並將不重複的request push寫入redis的request queue。
引擎請求request(Spider發出的)時，調度器從redis的request queue隊列裡根據優先級pop出一個request返回給引擎，引擎將此request發給spider處理。
Item Pipeline：
引擎將(Spider返回的)爬取到的Item交給Item Pipeline，scrapy-redis的Item Pipeline將爬取到的Item存入redis的items queue。
修改過的Item Pipeline可以很方便地根據key從items queue提取item，從而實現items processes集群。
Base Spider
不再使用scrapy原有的Spider類，重寫的RedisSpider繼承了Spider和RedisMixin這兩個類，RedisMixin是用來從redis讀取url的類。
當我們生成一個Spider繼承RedisSpider時，會調用setup_redis函數，這個函數會去連接redis數據庫，然後設置signals(信號)：
- 一個是當spider空閒時候的signal，會調用spider_idle函數，這個函數調用schedule_next_requests函數，保證spider是一直活著的狀態，並且拋出DontCloseSpider異常。
- 一個是當抓到一個item時的signal，會調用item_scraped函數，這個函數會調用schedule_next_requests函數，獲取下一個request。(注：下面所列版本的源碼中，setup_redis只連接了spider_idle信號。)
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import Spider, CrawlSpider
from . import connection, defaults
from .utils import bytes_to_str
class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""

    # Redis key the start URLs are popped from; when None it is taken from
    # the settings in setup_redis().
    redis_key = None
    # Upper bound on the number of requests produced by one next_requests()
    # call; when None it is taken from the settings in setup_redis().
    redis_batch_size = None
    # Encoding used to decode messages popped from redis; when None it is
    # taken from the settings in setup_redis().
    redis_encoding = None

    # Redis client placeholder.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        Does nothing when ``self.server`` is already set.

        Raises:
            ValueError: if no crawler is available, if the resolved redis
                key is blank, or if the batch size is not an integer.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        # The key may contain a ``%(name)s`` placeholder for the spider name.
        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        # The closing parenthesis of the message was missing in the original.
        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                         self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Yield up to ``redis_batch_size`` requests built from redis data.

        Messages are popped with SPOP when the ``REDIS_START_URLS_AS_SET``
        setting is true and with LPOP otherwise. Iteration stops early when
        the key yields no more data. Messages for which
        ``make_request_from_data`` returns a falsy value are logged and do
        not count towards the batch size.
        """
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        """Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.

        """
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)

    def schedule_next_requests(self):
        """Schedules a request if available"""
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits.

        Always raises ``DontCloseSpider`` after scheduling.
        """
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider
class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: ``defaults.START_URLS_AS_SET``)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider through the parent ``from_crawler`` and then
        set up its redis connection with ``setup_redis(crawler)``.
        """
        obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: ``defaults.START_URLS_AS_SET``)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider through the parent ``from_crawler`` and then
        set up its redis connection with ``setup_redis(crawler)``.
        """
        obj = super(RedisCrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
在scrapy_redis組件中不僅提供了RedisSpider，還提供了兼具深度爬蟲功能的RedisCrawlSpider，至於其餘幾個Redis分布式組件將在後面逐一分享。
使用Redis分布式組件時，爬蟲新增redis_key屬性，用於指定從redis中讀取起始URL的鍵(key)。
示例
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
class TestSpider(RedisCrawlSpider):
    """Example RedisCrawlSpider whose start URLs are read from the
    ``testspider:start_urls`` redis key.
    """

    name = 'test'
    allowed_domains = ['www.test.com']
    redis_key = 'testspider:start_urls'

    rules = [
        # Get the link of every listing page. No callback is given, so the
        # rule only follows links. Raw strings keep ``\d`` from being
        # treated as an (invalid) string escape; the pattern is unchanged.
        Rule(link_extractor=LinkExtractor(allow=r'/?page=\d+')),
        # Get the detail page of every company.
        Rule(link_extractor=LinkExtractor(allow=r'/\d+'), callback='parse_item')
    ]

    def parse_item(self, response):
        """Callback for company detail pages; returns the scraped item."""
        item = {}
        # ... populate ``item`` from ``response`` here ...
        return item
至於更多配置不再贅述，後續將對一些組件繼續深入分析。