Detailed diagram of the Scrapy execution flow
1. Downloader middleware
1.1 Downloader middleware
- settings configuration (a note on middleware priority follows the snippet)
DOWNLOADER_MIDDLEWARES = {
# 'TestSpider.middlewares.TestspiderDownloaderMiddleware': 543,
# 'TestSpider.middlewares.Test1Middleware': 543,
'TestSpider.middlewares.Test2Middleware': 543,
}
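The number is the middleware's order: lower values sit closer to the engine, so their process_request runs earlier and their process_response runs later. A built-in middleware can also be switched off by mapping it to None; the UserAgentMiddleware entry below is only an illustration of that, not something the project requires.

DOWNLOADER_MIDDLEWARES = {
    'TestSpider.middlewares.Test2Middleware': 543,
    # Disable Scrapy's built-in User-Agent middleware so only Test2Middleware
    # sets the header (illustrative assumption, not required):
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}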
- Override the downloader middleware methods (a short-circuit sketch follows the class)
from scrapy.http import Request, Response

class Test1Middleware:
    def process_request(self, request, spider):
        # Returning None means "continue": the process_request methods of the
        # remaining middlewares are called in turn, and if the last one also
        # returns None the request is handed to the downloader.
        return None
        # Returning a Response skips the downloader and delivers the response
        # directly to the parse callback, e.g.:
        # return Response(url='http://www.baidu.com', body=b'12345')
        # Returning a Request sends the request back to the scheduler instead
        # of downloading it here; not recommended, e.g.:
        # return Request(url='http://www.baidu.com',
        #                callback=self.parse,
        #                dont_filter=True)
        # if request.url != 'http://www.baidu.com':
        #     return Request(url='http://www.baidu.com')
    def process_response(self, request, response, spider):
        # Modify the response before it reaches the spider
        response.status = 201
        return response
    def process_exception(self, request, exception, spider):
        print('handling exception')
        # return None
        # Switch to another proxy IP and retry the request
        # request.meta['proxy'] = 'http://'
        return request
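To make the None / Response / Request return rules above concrete, here is a minimal sketch of a middleware that answers some requests from a local cache instead of calling the downloader. The CachedResponseMiddleware name and its cache dict are assumptions for illustration, not part of the project.

from scrapy.http import HtmlResponse

class CachedResponseMiddleware:
    # Hypothetical in-memory cache: url -> body bytes
    cache = {}
    def process_request(self, request, spider):
        body = self.cache.get(request.url)
        if body is not None:
            # Returning a Response here skips the downloader entirely; the
            # response still passes through process_response and then reaches
            # the spider's parse callback.
            return HtmlResponse(url=request.url, body=body, encoding='utf-8')
        # Fall through to the next middleware and eventually the downloader
        return None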
- Change the User-Agent and proxy IP (a rotation sketch follows the class)
from fake_useragent import UserAgent

class Test2Middleware:
    def process_request(self, request, spider):
        # Set a proxy IP
        # request.meta['proxy'] = 'http://122.117.65.107:52851'
        # Set a random User-Agent header
        ua = UserAgent()
        request.headers['User-Agent'] = ua.random
        return None
    def process_response(self, request, response, spider):
        return response
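A slightly fuller sketch of the same idea that rotates both the proxy and the User-Agent on every request; the addresses in PROXY_POOL are placeholders (the first is taken from the commented line above), and fake-useragent has to be installed (pip install fake-useragent).

import random
from fake_useragent import UserAgent

PROXY_POOL = [
    'http://122.117.65.107:52851',  # placeholder proxies
    'http://127.0.0.1:8888',
]

class RandomProxyUAMiddleware:
    def __init__(self):
        # Build the UserAgent database once instead of on every request
        self.ua = UserAgent()
    def process_request(self, request, spider):
        request.meta['proxy'] = random.choice(PROXY_POOL)
        request.headers['User-Agent'] = self.ua.random
        return None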
1.2 Spider middleware
- settings configuration
SPIDER_MIDDLEWARES = {
# 'TestSpider.middlewares.TestspiderSpiderMiddleware': 543,
'TestSpider.middlewares.BiqugeSpiderMiddleware': 543,
}
- Method called whenever the spider yields a Request or an Item (a simpler cleanup sketch follows the class)
from scrapy import Request
from TestSpider.items import BiqugeSpiderItem

class BiqugeSpiderMiddleware:
    def process_spider_output(self, response, result, spider):
        # Called whenever the spider yields a Request or an Item
        for i in result:
            # The spider yielded a Request object
            if isinstance(i, Request):
                yield i
            # The spider yielded an Item object
            if isinstance(i, BiqugeSpiderItem):
                # content was extracted as a list of strings, so str() turns it
                # into its repr; strip the escaped characters, quotes and brackets
                i['content'] = str(i['content']).replace('\\xa0', '')
                i['content'] = i['content'].replace('\\r', '').replace("', '',", '').replace('"', '')
                temp = i['content']
                i['content'] = ''
                for x in temp:
                    if x != '[' and x != ']' and x != "'":
                        i['content'] += x
                print(i['content'])
                yield i
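The cleanup above operates on the repr of the extracted list, which is why it replaces the escaped '\xa0' and strips brackets and quotes character by character. A simpler sketch with the same intent, assuming i['content'] is still the list of strings returned by extract():

def clean_content(fragments):
    # Join the extracted text nodes, then drop non-breaking spaces and CRs
    text = ''.join(fragments)
    return text.replace('\xa0', '').replace('\r', '')

# i['content'] = clean_content(i['content'])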
2. Connecting to the database
Preparation for the database connection
- Define the Item model in the items file
class BiqugeSpiderItem(scrapy.Item):
content = scrapy.Field()
name = scrapy.Field()
- Build the item object in the spider file (a sketch of the list-page callback that feeds it follows)
def parse_detail(self, response):
    sel = Selector(response)
    item = BiqugeSpiderItem()
    # Parse callback: the extracted content can be post-processed here
    # before the item is yielded
    item['content'] = sel.xpath('//*[@id="content"]/text()').extract()
    item['name'] = sel.xpath('//*[@class="content"]/h1/text()').extract_first()
    yield item
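For context, here is a sketch of the list-page callback that would feed parse_detail. The XPath for the chapter links and the use of Request (from scrapy import Request) are assumptions, since the original note only shows the detail callback.

def parse(self, response):
    # Follow every chapter link on the list page (XPath is an assumption)
    for href in response.xpath('//*[@id="list"]//a/@href').extract():
        yield Request(url=response.urljoin(href), callback=self.parse_detail)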
2.1 Connecting to MongoDB
- Configure the pipeline in settings
ITEM_PIPELINES = {
# 'TestSpider.pipelines.TestspiderPipeline': 300,
'TestSpider.pipelines.MongoDBPipeline': 300,
# 'TestSpider.pipelines.MysqlPipeline': 300,
}
- Add the settings parameters
# MongoDB configuration
MongoDB_HOST = '127.0.0.1'
MongoDB_PORT = 27017
MongoDB_PASSWORD = '123456'
MongoDB_DB = 'spider'
- Database connection in the pipelines file (a quick query to verify the results follows the class)
import pymongo
from TestSpider.settings import MongoDB_HOST, MongoDB_PORT, MongoDB_PASSWORD, MongoDB_DB

class MongoDBPipeline:
    # Persist the scraped data
    def __init__(self, mongo_host, mongo_port, mongo_password, mongo_db):
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.mongo_password = mongo_password
        self.mongo_db = mongo_db
    @classmethod
    def from_crawler(cls, crawler):
        # Return a MongoDBPipeline instance built from the settings constants
        return cls(
            mongo_host=MongoDB_HOST,
            mongo_port=MongoDB_PORT,
            mongo_password=MongoDB_PASSWORD,
            mongo_db=MongoDB_DB
        )
    def open_spider(self, spider):
        # Connect to MongoDB
        self.client = pymongo.MongoClient(host=self.mongo_host,
                                          port=self.mongo_port,
                                          password=self.mongo_password)
        self.db = self.client[self.mongo_db]
    def close_spider(self, spider):
        # Close the connection
        self.client.close()
    def process_item(self, item, spider):
        # Save the item as a plain dict in the 'biquge' collection
        # type(item) -> <class 'TestSpider.items.BiqugeSpiderItem'>
        self.db['biquge'].insert_one(dict(item))
        return item
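A quick way to check what the pipeline wrote, assuming a local MongoDB that accepts unauthenticated connections (adjust the client arguments if authentication is enforced):

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
for doc in client['spider']['biquge'].find().limit(3):
    print(doc.get('name'))
client.close()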
2.2 Connecting to MySQL
- Configure the pipeline in settings
ITEM_PIPELINES = {
# 'TestSpider.pipelines.TestspiderPipeline': 300,
# 'TestSpider.pipelines.MongoDBPipeline': 300,
'TestSpider.pipelines.MysqlPipeline': 300,
}
- Add the settings parameters
# MySQL configuration
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_PASSWORD = '960218'
MYSQL_USER = 'root'
MYSQL_DB = 'spider'
- Database connection in the pipelines file (a table-setup sketch follows the class)
import pymysql

class MysqlPipeline:
    def __init__(self, host, port, user, password, database):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.database = database
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get("MYSQL_HOST"),
            port=crawler.settings.get("MYSQL_PORT"),
            user=crawler.settings.get("MYSQL_USER"),
            password=crawler.settings.get("MYSQL_PASSWORD"),
            database=crawler.settings.get("MYSQL_DB"),
        )
    def open_spider(self, spider):
        # Connect to the database
        self.db = pymysql.connect(host=self.host,
                                  port=self.port,
                                  user=self.user,
                                  password=self.password,
                                  db=self.database,
                                  charset='utf8')
        self.cursor = self.db.cursor()
    def close_spider(self, spider):
        self.db.close()
    def process_item(self, item, spider):
        # Use a parameterized query so quotes in the scraped text cannot
        # break the SQL statement
        sql = "insert into biquge(content, name) values(%s, %s)"
        self.cursor.execute(sql, (item['content'], item['name']))
        self.db.commit()
        return item
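The pipeline assumes a biquge table already exists in the spider database. A one-off setup sketch is shown below; the column names follow the insert statement above, but the types and lengths are assumptions.

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='960218', db='spider', charset='utf8')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS biquge (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255),
            content TEXT
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()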
Note: from_crawler can be written in either of the two ways shown below: importing the configuration constants from settings.py directly, or reading them through crawler.settings.get(). The crawler.settings.get() form sees the fully resolved settings (including per-spider custom_settings and command-line overrides), while a direct import only sees what is written in settings.py. The last snippet is simply the positional-argument form of the first.
# Option 1: import the configuration constants from settings.py at module level
from TestSpider.settings import MongoDB_HOST, MongoDB_PORT, MongoDB_PASSWORD, MongoDB_DB
@classmethod
def from_crawler(cls, crawler):
    # Return a MongoDBPipeline instance
    return cls(
        mongo_host=MongoDB_HOST,
        mongo_port=MongoDB_PORT,
        mongo_password=MongoDB_PASSWORD,
        mongo_db=MongoDB_DB
    )
# Option 2: read the values through crawler.settings.get()
@classmethod
def from_crawler(cls, crawler):
    return cls(
        host=crawler.settings.get("MYSQL_HOST"),
        port=crawler.settings.get("MYSQL_PORT"),
        user=crawler.settings.get("MYSQL_USER"),
        password=crawler.settings.get("MYSQL_PASSWORD"),
        database=crawler.settings.get("MYSQL_DB"),
    )
# Positional-argument form of Option 1 (constants imported from settings.py)
@classmethod
def from_crawler(cls, crawler):
    return cls(
        MYSQL_HOST,
        MYSQL_PORT,
        MYSQL_USER,
        MYSQL_PASSWORD,
        MYSQL_DB,
    )