I. The CrawlSpider template
- Create a project
scrapy startproject <project_name>
- List the available spider templates
scrapy genspider -l
- Create a spider from the crawl template
scrapy genspider -t crawl <spider_name> <domain>
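For the example project used in the next section, the concrete commands would look like this (project and spider names are taken from the code below):
scrapy startproject CrawlSpiderDemo
cd CrawlSpiderDemo
scrapy genspider -t crawl dushu dushu.com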
II. The spider code
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
# LinkExtractor is used to extract links
from scrapy.spiders import CrawlSpider, Rule
# Rule defines a rule; LinkExtractor then goes and extracts URLs according to it
from CrawlSpiderDemo.items import CrawlspiderdemoItem

# The scrapy framework ships two kinds of spiders: Spider (the basic spider) and
# CrawlSpider (the rule-following "crawl" template spider).
# CrawlSpider is a subclass of Spider. The Spider class is designed to extract content
# only from the start_urls list; CrawlSpider adds rules that follow links, so every
# link on a page that matches the rules is extracted and pushed into the scheduler.
# As more and more URLs are visited, the spider keeps matching more URLs.
class DushuSpider(CrawlSpider):
    name = 'dushu'
    allowed_domains = ['dushu.com']
    start_urls = ['https://www.dushu.com/book/1002.html']

    rules = (
        Rule(LinkExtractor(allow=r'/book/1002_\d+\.html'), callback='parse_item', follow=True),
    )
    # rules: a tuple of Rule objects, each applying one crawling rule to the site. Every
    # link matching the LinkExtractor is extracted and pushed through the engine into the
    # scheduler's queue; the scheduler has it downloaded and the response is handed to the
    # callback parse_item (note the callback is written as a string). The pages fetched by
    # these follow-up requests are matched against the LinkExtractor again (duplicates are
    # dropped automatically), and so on until all matching pages have been crawled.

    # LinkExtractor matching rules:
    # by regular expression: LinkExtractor(allow="some regex")          # e.g. /book/1002_\d+\.html
    # by XPath:              LinkExtractor(restrict_xpaths="some xpath")
    # by CSS selector:       LinkExtractor(restrict_css="some css selector")
    def parse_item(self, response):
        print(response.url)
        # parse the list page
        book_list = response.xpath("//div[@class='bookslist']//li")
        for book in book_list:
            item = CrawlspiderdemoItem()
            item["book_name"] = book.xpath(".//h3/a/text()").extract_first()
            # build the URL of the second-level (detail) page
            next_url = "https://www.dushu.com" + book.xpath(".//h3/a/@href").extract_first()
            yield scrapy.Request(url=next_url, callback=self.parse_next, meta={"item": item})

    def parse_next(self, response):
        item = response.meta["item"]
        item["price"] = response.xpath("//span[@class='num']/text()").extract_first()
        m = response.xpath("//div[@class='text txtsummary']")[2]
        item["mulu"] = m.xpath(".//text()").extract()
        yield item
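The three LinkExtractor forms listed in the comments can also be exercised on their own; a minimal sketch (the regex is the one from the rule above, the XPath and CSS values are illustrative assumptions):
from scrapy.linkextractors import LinkExtractor

# by regular expression against the URL
le_regex = LinkExtractor(allow=r'/book/1002_\d+\.html')
# by XPath region: only links found inside the matched elements are taken
le_xpath = LinkExtractor(restrict_xpaths="//div[@class='bookslist']")
# by CSS selector region
le_css = LinkExtractor(restrict_css="div.bookslist")
# each extractor returns Link objects for a given response:
# links = le_regex.extract_links(response)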
Matching nodes with XPath (or any other selector) returns a list-like object.
.extract() returns the content of every match.
.extract_first() returns the content of the first match, or None when the list is empty, instead of raising an error.
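The spider imports CrawlspiderdemoItem from CrawlSpiderDemo.items. That file is not shown in these notes, but from the fields the spider fills in it would be a minimal sketch like:
# CrawlSpiderDemo/items.py
import scrapy

class CrawlspiderdemoItem(scrapy.Item):
    book_name = scrapy.Field()  # book title, taken from the list page
    price = scrapy.Field()      # price, taken from the detail page
    mulu = scrapy.Field()       # table-of-contents text, taken from the detail page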
III. IP proxy setup
- settings.py
IPPOOL = [
    {'ip': '113.16.160.101:8118'},
    {'ip': '119.31.210.170:7777'},
    {'ip': '183.129.207.83:10800'},
    # {'ip': ''},  # add more proxies here
]
# downloader middleware settings
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    'IpAgent.middlewares.IPPOOLS': 125,
}
IPPOOL above defines a field holding the proxies we have collected.
- middlewares.py
# import IPPOOL from the settings file
import random
from .settings import IPPOOL
# import the stock HttpProxyMiddleware
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

# a proxy middleware class that inherits from the stock proxy middleware
class IPPOOLS(HttpProxyMiddleware):
    # override the constructor
    def __init__(self, ip=''):
        self.ip = ip

    # override the request-processing hook
    def process_request(self, request, spider):
        # pick a random ip from the proxy pool
        current_ip = random.choice(IPPOOL)
        print('current ip:', current_ip['ip'])
        # point this request's proxy server at the chosen ip
        request.meta['proxy'] = 'https://' + current_ip['ip']
        # from here on the downloader fetches through that proxy
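A quick way to check that the proxy middleware is actually used is a throwaway spider against an IP-echo service (this spider is illustrative, assuming httpbin.org is reachable):
import scrapy

class ProxyCheckSpider(scrapy.Spider):
    name = 'proxy_check'
    start_urls = ['https://httpbin.org/ip']

    def parse(self, response):
        # httpbin echoes the origin IP of the request; it should be one of the
        # IPPOOL entries rather than your own address
        print(response.text)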
四昌执、動態(tài)頁面請求之selenium
- settings.py設(shè)置
# 下載中間件設(shè)置
DOWNLOADER_MIDDLEWARES = {
'Toutiao.middlewares.ToutiaoDownloaderMiddleware': 543,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware':None,
}
- middlewares.py設(shè)置
from scrapy import signals
from selenium import webdriver
from time import sleep
from scrapy.http import HtmlResponse

class ToutiaoDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # create a headless Chrome webdriver
        opt = webdriver.ChromeOptions()
        opt.add_argument("--headless")
        driver = webdriver.Chrome(options=opt)
        driver.get(request.url)
        sleep(3)
        # scroll the page step by step so lazy-loaded content gets rendered
        js = "var q = document.documentElement.scrollTop=%d"
        distance = 100
        for i in range(100):
            driver.execute_script(js % distance)
            distance += 100
            sleep(0.5)
        body = driver.page_source
        print("downloading through the middleware...")
        print("the browser is currently visiting:", driver.current_url)
        # the response body has to be rebuilt from the rendered page
        res = HtmlResponse(url=driver.current_url, body=body, encoding='utf-8', request=request)
        # release the browser; without this every request leaks a Chrome process
        driver.quit()
        return res
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
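Note that process_request above launches a new headless Chrome for every request, which is slow even with the driver.quit() call. A common refactor (a sketch, not part of the original notes) keeps one shared driver on the middleware and closes it when the spider closes:
from scrapy import signals
from selenium import webdriver

class ToutiaoDownloaderMiddleware(object):
    def __init__(self):
        opt = webdriver.ChromeOptions()
        opt.add_argument("--headless")
        # one shared browser for the whole crawl instead of one per request
        self.driver = webdriver.Chrome(options=opt)

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_closed(self, spider):
        # release the browser when the crawl ends
        self.driver.quit()
process_request then uses self.driver instead of creating its own driver, and the per-request quit() call is dropped.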
V. Basic distributed crawling with redis storage
The scrapy_redis component:
pip install scrapy_redis
1. The difference between scrapy and scrapy_redis
scrapy is a general-purpose crawling framework and does not support distributed crawling by itself.
scrapy_redis was created precisely to make scrapy distributed: it provides redis-backed components through which a distributed crawl can be built.
2. Deploying the distributed crawl
Server side (master):
One host acts as the machine running the redis server (the server side), also called the master.
Client side (slaver):
1) Turn an ordinary spider into a distributed one: remove start_urls (so the slavers cannot start crawling on their own) and replace it with redis_key (so the master controls what the slavers crawl).
- settings.py
Configure the pipeline and middleware:
ITEM_PIPELINES = {
    # A distributed spider's data need not go through a local pipeline (nothing is
    # stored locally); the data is stored in the redis database instead, so the redis
    # pipeline component is added here.
    "scrapy_redis.pipelines.RedisPipeline": 400,
}
# redis connection settings
# redis host address
REDIS_HOST = "134.175.114.102"
# port
REDIS_PORT = 6379
# password
# REDIS_PARAMS = {"password": 'xxxx'}
# 1. Switch the scheduler to the scrapy_redis scheduler (a rewrite of scrapy's native
#    scheduler by the scrapy_redis component, adding distributed scheduling logic)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# 2. Plug in the scrapy_redis dedupe component
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# 3. Whether the crawl may be paused and resumed
SCHEDULER_PERSIST = True
- the spider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider

class ReadbookSpider(RedisCrawlSpider):  # note: inherits from RedisCrawlSpider
    name = 'Readbook'
    allowed_domains = ['www.dushu.com']
    # start_urls = ['https://www.dushu.com/book/1002.html']
    # A distributed spider takes all of its urls from the relevant key in the redis
    # database; redis_key names the key the start urls are read from.
    redis_key = "dushu:start_urls"

    rules = (
        Rule(LinkExtractor(allow=r'/book/1002_?\d*\.html'), callback='parse_item', follow=True),
    )
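With the slavers running and blocked waiting on redis, the master starts the crawl by pushing the start URL onto the list named by redis_key, e.g. from redis-cli:
lpush dushu:start_urls https://www.dushu.com/book/1002.html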