安裝scrapy
pip install Scrapy
進(jìn)入終端戳玫,切換到自己項(xiàng)目代碼的工作空間下,執(zhí)行
scrapy startproject PicSpider
image.png
生成如下工程文件:
images是自己創(chuàng)建的用于存放爬到的圖片目錄肃拜。
image.png
在spiders目錄下創(chuàng)建baidu_pic_spider爬蟲文件,search_word可改成自己需要的搜索詞。
baidu_pic_spider.py
# -*- coding: utf-8 -*-
import scrapy, json
from scrapy.http import Request
from PicSpider.items import PicItem # 導(dǎo)入item
class PicSpider(scrapy.Spider):
    """Spider that queries Baidu image search and yields one PicItem per result."""

    name = "pic_spider"
    # allowed_domains must hold bare domain names, not full URLs —
    # a URL here makes OffsiteMiddleware filter out every request.
    allowed_domains = ["image.baidu.com"]
    start_urls = ["http://image.baidu.com"]

    def parse(self, response):
        """Build the acjson search URL for the keyword and hand it to get_pic."""
        search_word = '哈士奇'  # search keyword, change as needed
        baidu_pic_url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={0}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&pn=60&rn=30&gsm=3c&1507915209449=".format(
            search_word)  # Baidu image-search JSON endpoint
        # Hand the keyword-parameterized URL to Scrapy; the response is
        # analyzed further by the get_pic callback.
        yield Request(baidu_pic_url, meta={"search_word": search_word},
                      callback=self.get_pic, dont_filter=True)

    def get_pic(self, response):
        """Parse the JSON result list and yield one item per picture entry."""
        response_dict = json.loads(response.text)  # response body is JSON
        response_dict_data = response_dict['data']  # picture entries live under 'data'
        for pic in response_dict_data:  # each entry is a dict describing one image
            if not pic:  # the list ends with an empty dict — skip it
                continue
            # Create a fresh item per picture: reusing a single instance
            # across yields lets later results overwrite earlier ones that
            # are still travelling through the pipeline.
            item = PicItem()
            item['search_word'] = response.meta['search_word']  # search keyword
            item['pic_url'] = [pic['middleURL']]  # ImagesPipeline expects a list of URLs
            item['pic_name'] = pic['fromPageTitleEnc']  # title of the source page
            yield item
新建main.py文件,方便在pycharm中運(yùn)行和調(diào)試爬蟲驱犹。
main.py
# -*- coding: utf-8 -*-
"""Entry point so the spider can be run and debugged from an IDE."""
import os
import sys

from scrapy.cmdline import execute

# Make the project root importable so scrapy can locate the package.
project_root = os.path.dirname(os.path.abspath(__file__))
sys.path.append(project_root)

if __name__ == "__main__":
    print(project_root)
    # execute() returns None; the original chained .strip() onto it,
    # which raised AttributeError after the crawl finished.
    execute(["scrapy", "crawl", "pic_spider"])
定義item字段
items.py
# -*- coding: utf-8 -*-
import scrapy


class PicItem(scrapy.Item):
    """Container for one Baidu image search result."""

    search_word = scrapy.Field()  # keyword the search was run with
    pic_name = scrapy.Field()     # title of the page the image came from
    pic_url = scrapy.Field()      # image URL(s), stored as a list
定義pipeline
pipelines.py
# -*- coding: utf-8 -*-


class PicspiderPipeline(object):
    """Pass-through pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        # No transformation needed here; returning the item keeps it
        # flowing to the next pipeline stage (the ImagesPipeline).
        return item
在setting中對(duì)應(yīng)部分修改ITEM_PIPELINES嘲恍,并增加圖片處理代碼
settings.py
import os  # needed below; the snippet used os without importing it

# Enable both the project pipeline and Scrapy's built-in image pipeline.
# Lower numbers run first, so images are downloaded before
# PicspiderPipeline sees the item.
ITEM_PIPELINES = {
    'PicSpider.pipelines.PicspiderPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

# Item field holding the list of image URLs for ImagesPipeline to fetch.
IMAGES_URLS_FIELD = "pic_url"

# Store downloaded images in the 'images' folder under the project dir.
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')

# Skip images smaller than 100x100 pixels.
IMAGES_MIN_HEIGHT = 100
IMAGES_MIN_WIDTH = 100
運(yùn)行
run main.py
image.png