# -*- coding: utf-8 -*-
# Define here the models for your spider middleware.
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# =========================== SpiderMiddleware ======================
# Definition: the framework sitting between the Scrapy engine and the spider;
# its main job is to process the spider's response input and request output.
# Scrapy already ships some ready-to-use spider middlewares, defined by
# SPIDER_MIDDLEWARES_BASE:
# {
#     'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
#     'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
#     'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
#     'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
#     'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
# }
# ================= SpiderMiddleware class ==================
class MaoyanSpiderMiddleware(object):?
?????????@classmethod
?????????# 類(lèi)方法耀鸦,參數(shù)crawler瞧哟,可以通過(guò)crawler調(diào)用settings里的全局參數(shù)?
?????????def from_crawler(cls, crawler):?
?????????????????????""" :param crawler: 獲取settings里的全局參數(shù)且轨,如????????????????????crawler.settings.get(參數(shù)) """?
?????????s = cls()
?????????????# 調(diào)用spider_opened函數(shù)進(jìn)行爬取數(shù)據(jù)并對(duì)該函數(shù)發(fā)送該信號(hào)毛肋。該信號(hào)一? ? ? ? ? ? ? ? ? 般用來(lái)分配spider的資源?
?????crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)?
? ? ? ?# 調(diào)用spider_closed函數(shù)進(jìn)行關(guān)閉爬蟲(chóng)并對(duì)該函數(shù)發(fā)送該信號(hào)梨水。該信號(hào)用來(lái)釋放? ? ? spider在spider_opened時(shí)占用的資源。?
?????# crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
?????????return s?
?# 當(dāng)返回來(lái)的response被Spider Middleware處理時(shí)僚饭,該方法被調(diào)用?
?def process_spider_input(self, response, spider):?
?????"""?
?????????????:param response: 被Spider Middleware處理的response對(duì)象?
?????????????:param spider: 返回response對(duì)應(yīng)的spider對(duì)象?
?"""?
?????????????????return None?
?# 當(dāng)spider處理response對(duì)象的結(jié)果后震叮,該方法被調(diào)用?
?def process_spider_output(self, response, result, spider): ????
?????????""" :param response: 被spider處理后得到結(jié)果的response對(duì)象?
? ? ? ? ? ? ? ? :param result: result包含Item或request對(duì)象的可迭代對(duì)象,即spider返回的response結(jié)果?
?????????????????:param spider: 返回response對(duì)象的spider對(duì)象 """
?????????????# 遍歷返回的可迭代對(duì)象?
?????????????for i in result:?
?????????????????????yield i?
?????????# 當(dāng)spider的process_spider_input和process_spider_output發(fā)生異常時(shí)調(diào)用該方法?
def process_spider_exception(self, response, exception, spider):?
?"""
?????????????:param response: 異常被拋出時(shí)被處理的response對(duì)象?
?????????????:param exception: 拋出的異常?
? ? ? ? ? ? ?:param spider: 拋出該異常的spider對(duì)象 """?
? ? ? ? ? ? ? pass 鳍鸵、
# 以spider啟動(dòng)的request為參數(shù)調(diào)用該方法,返回一個(gè)request可迭代對(duì)象?
?def process_start_requests(self, start_requests, spider):?
?"""?
?????????????:param start_requests: 開(kāi)始請(qǐng)求的可迭代對(duì)象?
?????????????:param spider: 開(kāi)始請(qǐng)求所對(duì)應(yīng)的spider對(duì)象 """?
? ? ? ? ? ? ? ? # 遍歷可迭代對(duì)象?
? ? ? ? ? ? ? ? for r in start_requests: yield r?
?# 當(dāng)spider開(kāi)啟時(shí)調(diào)用該函數(shù)苇瓣,說(shuō)明開(kāi)始爬取數(shù)據(jù)并分配spider的資源?
?def spider_opened(self, spider):?
?????????""" :param spider: 開(kāi)始爬取的spider對(duì)象 """?
?????????spider.logger.info('Spider opened: %s' % spider.name)
? ? ? ? ? ?# # 當(dāng)某個(gè)spider被關(guān)閉時(shí),說(shuō)明關(guān)閉該爬蟲(chóng)并釋放spider在spider_opened時(shí)占用的資源权纤。?
?# def spider_closed(self, spider):?
?# """ # :param spider: 開(kāi)始爬取的spider對(duì)象 # """?
?# spider.logger.info('Spider opened:%s'%spider.name)# ======================DownloaderMiddleware=======================定義:位于Scrapy引擎和下載器之間的框架钓简,主要是處理Scrapy引擎與下載器之間的請(qǐng)求及響應(yīng)。見(jiàn)scrapy框架圖
# Downloader Middleware capabilities: modify the User-Agent, handle
# redirects, set proxies, retry failures, set cookies, and so on.
# Scrapy already ships some ready-to-use downloader middlewares, defined by
# DOWNLOADER_MIDDLEWARES_BASE:
# {
#     'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
#     'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
#     'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
#     'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
#     'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
#     'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
#     'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
#     'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
#     'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
#     'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
#     'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
#     'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
#     'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
#     'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
# }
# =============== DownloaderMiddleware class =================
class MaoyanDownloaderMiddleware(object):?
?????@classmethod?
?????# 類(lèi)方法,參數(shù)crawler槽唾,可以通過(guò)crawler調(diào)用settings里的全局參數(shù)?
?????def from_crawler(cls, crawler):?
?""" :param crawler: 獲取settings里的全局參數(shù)丧枪,如crawler.settings.get(參數(shù)) """?
?????????????????s = cls()?
?????????????????# 調(diào)用spider_opened函數(shù)進(jìn)行爬取數(shù)據(jù)并對(duì)該函數(shù)發(fā)送該信號(hào)。該信號(hào)一般用來(lái)分配spider的資源?
?????????????????crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) # 調(diào)用spider_closed函數(shù)進(jìn)行關(guān)閉爬蟲(chóng)并對(duì)該函數(shù)發(fā)送該信號(hào)庞萍。該信號(hào)用來(lái)釋放spider在spider_opened時(shí)占用的資源拧烦。
?????????????????# crawler.signals.connect(s.spider_closed, signal=signals.spider_closed) ????????????????return s?
?# request被scrapy從調(diào)度器調(diào)度給Downloader Middleware之前調(diào)用該方法對(duì)request對(duì)象進(jìn)行處理?
?????def process_request(self, request, spider):?
?????""" :param request: 就是scrapy從調(diào)度器調(diào)度出來(lái)的request對(duì)象?
?????????:param spider: 就是scrapy調(diào)度出來(lái)的request對(duì)象的spider對(duì)象?
?????"""?
?????????????return None?
?# request對(duì)象被Downloader Middleware執(zhí)行后返回response是才調(diào)用該方法對(duì)response對(duì)象進(jìn)行處理?
?????????def process_response(self, request, response, spider):?
?""" :param request: 調(diào)度出來(lái)被Downloader Middleware處理的request對(duì)象?
?????:param response: Downloader Middleware處理request對(duì)象返回后的response對(duì)象 :param spider: response返回來(lái)的spider對(duì)象?
?????"""?
?????????????????????return response?
?# 當(dāng)process_request和process_response發(fā)生異常時(shí)調(diào)用?
?????def process_exception(self, request, exception, spider):?
?????????????""" :param request: 產(chǎn)生異常的request對(duì)象?
? ? ? ? ? ? ? ? ? :param exception: 拋出的異常對(duì)象?
? ? ? ? ? ? ? ? ? :param spider: 產(chǎn)生異常的request對(duì)象的spider對(duì)象 """?
?????????????????????pass # 當(dāng)spider開(kāi)啟時(shí)調(diào)用該函數(shù),
說(shuō)明開(kāi)始爬取數(shù)據(jù)并分配spider的資源
?????def spider_opened(self, spider):
?""" :param spider: 開(kāi)始爬取的spider對(duì)象 """?
?????????????spider.logger.info('Spider opened: %s' % spider.name)?
?# # 當(dāng)某個(gè)spider被關(guān)閉時(shí)钝计,說(shuō)明關(guān)閉該爬蟲(chóng)并釋放spider在spider_opened時(shí)占用的資源恋博。
?????# def spider_closed(self, spider):?
?????????????# """ # :param spider:
?????????????????????????開(kāi)始爬取的spider對(duì)象 #?
?????????????????""" # spider.logger.info('Spider opened: %s' % spider.name)