#1 Why use scrapy:
Because the crawler I wrote myself with the Python requests library could not get the HTML generated by JS; the HTML files it fetched were incomplete.
#2 What is scrapy-redis
scrapy-redis is a plugin for scrapy: the spider automatically pulls the links to be crawled from redis and fetches those pages. It is simple to use, and you can stand up a distributed crawler framework with it very quickly.
#3 Installation
```
# install scrapy
$ pip install scrapy
# install scrapy-redis
$ pip install scrapy-redis
```
#4 Create a project
```
$ scrapy startproject <project_name> [project_dir]
```
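For reference, running `scrapy startproject MyScrapy` (the project name used in the rest of this post) generates a layout roughly like the following; the exact set of files varies slightly between scrapy versions:

```
MyScrapy/
    scrapy.cfg            # deploy configuration
    MyScrapy/             # the project's Python module
        __init__.py
        items.py
        pipelines.py
        settings.py       # project settings; scrapy-redis config goes here
        spiders/
            __init__.py
```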
#5 Configuring and using scrapy-redis
Omitted here; see the source repository at https://github.com/rmax/scrapy-redis
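For completeness, here is a minimal `settings.py` sketch based on the options documented in the scrapy-redis README; the Redis address is an assumption and should point at your own instance:

```python
# settings.py -- minimal scrapy-redis setup (a sketch; adjust values to
# your own deployment).

# Queue requests in Redis and deduplicate them there, instead of using
# scrapy's in-memory scheduler and dupefilter.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Keep the Redis queues when the spider closes, so crawls can resume.
SCHEDULER_PERSIST = True

# Redis connection (assumption: a default local instance).
REDIS_URL = "redis://localhost:6379"

# Optional: also store scraped items in Redis.
ITEM_PIPELINES = {
    "scrapy_redis.pipelines.RedisPipeline": 300,
}
```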
#6 Integrating scrapy into your own project
Create `/yourProjectRoot/MyScrapy/spiders/XXXSpider.py`, where MyScrapy is the scrapy project folder:
```python
from scrapy_redis.spiders import RedisSpider


class XXXSpider(RedisSpider):
    # Default request headers for this spider; fill in as needed.
    _headers = {
    }

    # Spider name.
    name = 'xxxSpider'
    # The Redis list key this spider pops URLs to crawl from.
    redis_key = "spider-queue:xxxSpider"

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        # domain = kwargs.pop('domain', '')
        # self.allowed_domains = filter(None, domain.split(','))
        super(XXXSpider, self).__init__(*args, **kwargs)

    def make_requests_from_url(self, url):
        """
        Apply custom settings to each request built from a Redis URL.
        :param url: the URL popped from the Redis list
        :return: the request, with our headers applied
        """
        request = super(XXXSpider, self).make_requests_from_url(url)
        # Update the existing Headers object in place rather than
        # replacing it with a plain dict, which downstream middleware
        # may not expect.
        request.headers.update(self._headers)
        return request

    def parse(self, response):
        # do whatever you want with the response
        pass

    @classmethod
    def run(cls):
        import scrapy.cmdline as sm
        # Launch the spider through scrapy's command-line machinery.
        sm.execute(["scrapy", "crawl", "xxxSpider"])
```
Then, in your own code, call `XXXSpider.run()` to run and debug the spider.
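Note that the spider will sit idle until its Redis list contains something to crawl. Below is a minimal sketch of seeding the queue with the redis-py client; the URL and the local Redis instance are assumptions for illustration:

```python
import redis

# Connect to the same Redis instance the spider uses
# (assumption: a default local instance).
r = redis.StrictRedis(host="localhost", port=6379)

# Push a start URL onto the list named by the spider's redis_key.
# The spider pops it and crawls the page; the URL is just an example.
r.lpush("spider-queue:xxxSpider", "http://example.com/")
```

The same can be done from redis-cli with `lpush spider-queue:xxxSpider <url>`.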