Python Scrapy
What is a web crawler?
A web crawler, also called a spider, is a bot that automatically browses the World Wide Web, usually for the purpose of building a web index.
Python and Web Crawling
Python dominates the web-crawling field. It treats everything on the web as a data source and uses automated programs to collect and process that data in a targeted way. Anyone working in this area should study crawling strategies, high-performance asynchronous I/O, and distributed crawling, and should dig into the Scrapy source code to understand how the framework works and, eventually, to build a custom crawling framework of their own.
Scrapy, a Python Crawling Framework
Scrapy is an application framework written in Python for crawling websites and extracting structured data. It is commonly used in programs for data mining, information processing, and archiving historical data.
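To get a feel for how little code a working spider needs, here is a minimal sketch. It targets quotes.toscrape.com, a public scraping sandbox; the site and the CSS selectors are only illustrative and are not part of the Douban demo further down.
import scrapy

class QuotesSpider(scrapy.Spider):
    """Minimal illustrative spider; the site and selectors are assumptions."""
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        # Yield one dict per quote block; Scrapy accepts plain dicts as items.
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }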
Scrapy Core Components
Scrapy Engine: coordinates the communication, signals, and data transfer between the Spider, Item Pipeline, Downloader, and Scheduler.
Scheduler: accepts Requests sent over by the engine, orders and queues them, and hands them back to the engine when it asks for more.
Downloader: downloads every Request sent by the Scrapy Engine and returns the resulting Responses to the engine, which passes them on to the Spider for processing.
Spider: processes all Responses, extracts the data needed to fill the Item fields, and hands any follow-up URLs back to the engine, which puts them into the Scheduler again.
Item Pipeline: receives the Items produced by the Spider and performs the post-processing (detailed parsing, filtering, storage, and so on).
Downloader Middlewares: components you can write to extend or customize the download step (a minimal sketch follows this list).
Spider Middlewares: components that extend and hook into the communication between the engine and the Spider, i.e. the Responses going into the Spider and the Requests coming out of it.
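To make the middleware hooks more concrete, here is a minimal sketch of a downloader middleware that adds a header to every outgoing request. The class name and the header are made up for illustration, and the middleware would still need to be enabled under DOWNLOADER_MIDDLEWARES in settings.py.
# An illustrative downloader middleware (class name and header are assumptions)
class CustomHeaderDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Called for every request the engine hands to the downloader.
        request.headers.setdefault("X-Example", "demo")
        return None  # returning None lets the request continue down the chain

    def process_response(self, request, response, spider):
        # Called for every response coming back from the downloader.
        spider.logger.debug("Got %s for %s", response.status, request.url)
        return response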
Scrapy Demo
# Create a virtual environment for the crawler
$ conda create --name scrapy python=3.6
# Activate the virtual environment
$ conda activate scrapy
# Install scrapy
$ conda install scrapy
# Create a crawler project with the scrapy CLI
$ scrapy startproject mySpider
# Run a spider (scrapyName is the name attribute defined in the spider class)
$ scrapy crawl scrapyName
Project Files
scrapy.cfg: the project's configuration file.
mySpider/: the project's Python module; your code is imported from here.
mySpider/items.py: the project's item definitions.
mySpider/pipelines.py: the project's pipeline file.
mySpider/settings.py: the project's settings file.
mySpider/spiders/: the directory that holds the spider code (see the layout sketch below).
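For reference, the tree generated by scrapy startproject mySpider typically looks like the sketch below; the exact files can vary slightly between Scrapy versions.
mySpider/
    scrapy.cfg            # project configuration
    mySpider/
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider / downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py   # spider modules go in this package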
Crawling the Douban Top 250
- items.py
import scrapy


class DoubanItem(scrapy.Item):
    """One movie entry scraped from the Douban Top 250 list."""
    name = scrapy.Field()      # movie title
    director = scrapy.Field()  # director / cast line
    detail = scrapy.Field()    # additional detail line
    star = scrapy.Field()      # rating score
    synopsis = scrapy.Field()  # one-line quote
    comment = scrapy.Field()   # number of ratings
- spiders/DoubanSpider.py
# coding:utf-8
import scrapy
from scrapy import Request
from douban.items import DoubanItem


class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        # Each movie sits in its own <li> of the ranked list.
        movie_list = response.xpath("//div[@class='article']/ol/li")
        if movie_list and len(movie_list) > 0:
            for movie in movie_list:
                item = DoubanItem()
                item['name'] = movie.xpath("./div/div[2]/div[1]/a/span[1]/text()").extract()[0]
                item['director'] = movie.xpath("normalize-space(./div/div[2]/div[2]/p/text())").extract_first()
                item['detail'] = movie.xpath("normalize-space(./div/div[2]/div[2]/p[1]/text())").extract()[0]
                item['star'] = movie.xpath("./div/div[2]/div[2]/div/span[2]/text()").extract()[0]
                item['synopsis'] = movie.xpath("normalize-space(./div/div[2]/div[2]/p[2]/span/text())").extract()[0]
                item['comment'] = movie.xpath("./div/div[2]/div[2]/div/span[4]/text()").extract()[0]
                yield item
        # Follow the "next page" link until the last page is reached.
        next_link = response.xpath("//span[@class='next']/a/@href").extract()
        if next_link:
            yield Request("https://movie.douban.com/top250" + next_link[0], callback=self.parse, dont_filter=True)
- pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from database_handler import DatabaseHandler


class DoubanPipeline(object):
    def __init__(self):
        self.db = DatabaseHandler(host="xxx", username="xxx", password="xxx", database="xxx")

    def close_spider(self, spider):
        # Close the MySQL connection when the spider finishes.
        self.db.close()

    def process_item(self, item, spider):
        # Save each Item into the t_douban table.
        sql = "insert into t_douban(name,director,detail,star,synopsis,comment) values('%s', '%s', '%s', '%s', '%s', '%s')" % (
            item['name'], item['director'], item['detail'], item['star'], item['synopsis'], item['comment'])
        self.db.insert(sql)
        return item
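Formatting values straight into the SQL string breaks as soon as a title or quote contains a single quote, and it is open to SQL injection. A safer alternative is pymysql's parameter binding; the sketch below bypasses DatabaseHandler and talks to a pymysql connection directly (the helper name and the connection object are placeholders):
# Sketch: parameterized insert with pymysql (save_item and db are hypothetical)
def save_item(db, item):
    sql = ("insert into t_douban(name, director, detail, star, synopsis, comment) "
           "values (%s, %s, %s, %s, %s, %s)")
    with db.cursor() as cursor:
        cursor.execute(sql, (item['name'], item['director'], item['detail'],
                             item['star'], item['synopsis'], item['comment']))
    db.commit()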
- database_handler.py
# coding:utf-8
import pymysql
from pymysql.err import MySQLError, ProgrammingError


class DatabaseHandler(object):
    def __init__(self, host, username, password, database, port=3306):
        """Initialize the database connection."""
        self.host = host
        self.username = username
        self.password = password
        self.port = port
        self.database = database
        # Use keyword arguments: recent PyMySQL versions no longer accept positional ones.
        self.db = pymysql.connect(host=self.host, user=self.username, password=self.password,
                                  database=self.database, port=self.port, charset='utf8')
        self.cursor = None

    def execute(self, sql):
        """Execute a single SQL statement and commit it."""
        try:
            self.cursor = self.db.cursor()
            self.cursor.execute(sql)
            self.db.commit()
        except (MySQLError, ProgrammingError) as e:
            print(e)
            self.db.rollback()
        else:
            print("rowCount: %s rowNumber: %s" % (self.cursor.rowcount, self.cursor.rownumber))
        finally:
            self.cursor.close()

    def update(self, sql):
        """Run an UPDATE statement."""
        self.execute(sql)

    def insert(self, sql):
        """Run an INSERT statement and return the id of the last inserted row."""
        self.execute(sql)
        return self.cursor.lastrowid

    def insert_batch(self, sql, rows):
        """Insert many rows at once with executemany()."""
        try:
            self.cursor = self.db.cursor()
            self.cursor.executemany(sql, rows)
            self.db.commit()
        except (MySQLError, ProgrammingError) as e:
            print(e)
            self.db.rollback()
        else:
            print("rowCount: %s rowNumber: %s" % (self.cursor.rowcount, self.cursor.rownumber))
        finally:
            self.cursor.close()

    def delete(self, sql):
        """Run a DELETE statement."""
        self.execute(sql)

    def select(self, sql):
        """Run a SELECT statement and return the rows as a list of dicts."""
        self.cursor = self.db.cursor(cursor=pymysql.cursors.DictCursor)
        result = []
        try:
            self.cursor.execute(sql)
            data = self.cursor.fetchall()
            for row in data:
                result.append(row)
        except MySQLError as e:
            print(e)
        else:
            print(f"rowCount: {self.cursor.rowcount} rowNumber: {self.cursor.rownumber}")
            return result
        finally:
            self.cursor.close()

    def call_proc(self, name):
        """Call a stored procedure and return its first result row."""
        self.cursor = self.db.cursor()
        self.cursor.callproc(name)
        return self.cursor.fetchone()

    def close(self):
        """Close the database connection."""
        self.db.close()


if __name__ == "__main__":
    pass
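For a quick sanity check outside Scrapy, the handler can be used on its own; the credentials and the query below are placeholders:
# Hypothetical standalone usage of DatabaseHandler
db = DatabaseHandler(host="localhost", username="root", password="secret", database="douban")
rows = db.select("select name, star from t_douban limit 10")
for row in rows:
    print(row["name"], row["star"])
db.close()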
- Edit settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}
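Depending on the Scrapy version and how politely you want to crawl, a couple of other settings are often adjusted as well; the values below are only suggestions, not requirements:
ROBOTSTXT_OBEY = False   # projects generated by startproject obey robots.txt by default
DOWNLOAD_DELAY = 1       # wait one second between requests to the same site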
Run
scrapy crawl douban
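If you prefer launching the crawl from a plain Python script instead of the CLI, Scrapy's CrawlerProcess can do it; the import path of the spider below is an assumption based on the file layout above:
# run.py - hypothetical script entry point for the douban spider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from douban.spiders.DoubanSpider import DoubanSpider

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())  # picks up settings.py
    process.crawl(DoubanSpider)
    process.start()  # blocks until the crawl is finished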