python爬蟲爬取王者榮耀英雄圖片
python爬取數據四步走
1、確定目標
2、分析目標
3、編寫代碼
4、執行爬蟲
1、確定目標
爬取目標:
url = 'https://pvp.qq.com/web201605/herolist.shtml'
注意:有時候有些頁面是異步加載,直接請求url地址是獲得不了數據的。而王者榮耀英雄頁面就是異步加載的,英雄列表數據是動態請求的,通過F12或者selenium分析可以找到一個json文件:
# json文件地址
https://pvp.qq.com/web201605/js/herolist.json
2、分析目標
思路:
先在英雄列表得到所有的英雄信息,然后依次循環爬取單個英雄的信息,得到單個英雄的所有皮膚。
選擇幾個英雄的路徑如下:
<a href="herodetail/105.shtml" target="_blank"><img src="http://game.gtimg.cn/images/yxzj/img201606/heroimg/105/105.jpg" alt="廉頗" width="91" height="91">廉頗</a>
# 對應的json對象
{
"0": {
"ename": 105,
"cname": "廉頗",
"title": "正義爆轟",
"new_type": 0,
"hero_type": 3,
"skin_name": "正義爆轟|地獄巖魂"
}
}
<a href="herodetail/106.shtml" target="_blank"><img src="http://game.gtimg.cn/images/yxzj/img201606/heroimg/106/106.jpg" alt="小喬" width="91" height="91">小喬</a>
# 對應的json對象
{
"1": {
"ename": 106,
"cname": "小喬",
"title": "戀之微風",
"new_type": 0,
"hero_type": 2,
"skin_name": "戀之微風|萬圣前夜|天鵝之夢|純白花嫁|繽紛獨角獸"
}
}
<a href="herodetail/107.shtml" target="_blank"><img src="http://game.gtimg.cn/images/yxzj/img201606/heroimg/107/107.jpg" alt="趙云" width="91" height="91">趙云</a>
# 對應的json對象
{
"2": {
"ename": 107,
"cname": "趙云",
"title": "蒼天翔龍",
"new_type": 0,
"hero_type": 1,
"hero_type2": 4,
"skin_name": "蒼天翔龍|忍●炎影|未來紀元|皇家上將|嘻哈天王|白執事|引擎之心"
}
}
<a href="herodetail/108.shtml" target="_blank"><img src="http://game.gtimg.cn/images/yxzj/img201606/heroimg/108/108.jpg" alt="墨子" width="91" height="91">墨子</a>
# 對應的json對象
{
"3": {
"ename": 108,
"cname": "墨子",
"title": "和平守望",
"new_type": 0,
"hero_type": 2,
"hero_type2": 1,
"skin_name": "和平守望|金屬風暴|龍騎士|進擊墨子號"
}
}
<a href="herodetail/109.shtml" target="_blank"><img src="http://game.gtimg.cn/images/yxzj/img201606/heroimg/109/109.jpg" alt="妲己" width="91" height="91">妲己</a>
# 對應的json對象
{
"4": {
"ename": 109,
"cname": "妲己",
"title": "魅力之狐",
"pay_type": 11,
"new_type": 0,
"hero_type": 2,
"skin_name": "魅惑之狐|女仆咖啡|魅力維加斯|仙境愛麗絲|少女阿貍|熱情桑巴"
}
}
根據上述,可以推測出英雄圖片地址:
# [http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/(英雄編號)/(英雄編號)-bigskin-(第幾個皮膚).jpg](http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/137/137-bigskin-1.jpg)
https://game.gtimg.cn/images/yxzj/img201606/heroimg/ + ename + / + ename + .jpg
3、編寫代碼
封裝請求方法、
# 封裝請求方法
def send_get(self, url):
    """Send a GET request and return the response, or ``None`` on failure.

    Args:
        url: Address to request.

    Returns:
        The ``requests`` response when the server answers with HTTP 200.
        ``None`` when the request raises or any other status code comes
        back; the reason is printed in both cases.
    """
    try:
        # The spider stores its request headers on ``self.header``.
        # A timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(url, headers=self.header, timeout=10)
    except requests.RequestException as e:
        print(e)
        return None
    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O``, which would let failed requests through silently.
    if response.status_code != 200:
        print('{}請求失敗'.format(url))
        return None
    return response
獲得首頁英雄列表、
# 獲得所有的英雄列表
def hero_list(self):
    """Fetch the hero list and pass it on to ``process_heroes``.

    A diagnostic is printed, and nothing is raised, when the list cannot
    be downloaded or when its body is not valid JSON.
    """
    response = self.send_get(self.hero_list_url)
    if not response:
        print('英雄列表為空,請檢查獲取URL:{}'.format(self.hero_list_url))
        return
    try:
        hero_list_dict = json.loads(response.text)
    except ValueError as e:
        # json.JSONDecodeError is a ValueError subclass.
        print('英雄列表解析失敗,錯誤原因:{}'.format(e))
        return
    # The parsed top-level keys become keyword arguments.
    self.process_heroes(**hero_list_dict)
處理英雄列表數據、
# 對英雄列表進行處理
def process_heroes(self, **hero_list_dict):
    """Download and process the detail data of every listed hero.

    Args:
        **hero_list_dict: Parsed hero-list JSON. Its ``'hero'`` entry is
            iterated; each item is read for ``'heroId'`` and ``'name'``.
    """
    for hero in hero_list_dict['hero']:
        hero_info_url = self.hero_url.format(hero['heroId'])
        resp = self.send_get(hero_info_url)
        if not resp:
            # Report the URL that actually failed, not the list URL.
            print('獲取英雄:{}失敗,請檢查獲取URL:{}'.format(hero['name'], hero_info_url))
            continue
        try:
            hero_info_dict = json.loads(resp.text)
        except ValueError as e:
            # One malformed hero file must not abort the remaining heroes.
            print('獲取英雄:{}失敗,錯誤原因:{}'.format(hero['name'], e))
            continue
        self.process_hero(**hero_info_dict)
單個英雄數(shù)據(jù)獲取
# 單個英雄處理
def process_hero(self, **hero_info_dict):
    """Download and save every skin image of a single hero.

    Args:
        **hero_info_dict: Parsed hero-detail JSON. ``'hero'`` supplies
            ``'name'`` and ``'title'``; ``'skins'`` is iterated and each
            item is read for ``'mainImg'`` and ``'name'``.
    """
    hero = hero_info_dict['hero']
    skins = hero_info_dict['skins']
    # Loop-invariant: every skin of one hero goes into the same directory.
    hero_image_dir = os.path.join(self.base_path, hero['name'] + hero['title'])
    for skin in skins:
        if not skin['mainImg']:
            continue
        skin_response = self.send_get(skin['mainImg'])
        if skin_response is None:
            # send_get returns None on failure (and has already printed the
            # reason); skip this skin instead of raising AttributeError.
            continue
        hero_image_name = '{}.jpg'.format(skin['name'])
        self.save_image(hero_image_dir, hero_image_name, skin_response.content)
    print('hero:{},skins:{}張,處理完成'.format(hero['name'], len(skins)))
    # Pause one second before returning (presumably to throttle requests).
    time.sleep(1)
圖片保存、
def save_image(self, image_dir, image_name, image_content):
    """Write image bytes into ``image_dir`` under a sanitised file name.

    Args:
        image_dir: Target directory; created when it does not exist.
        image_name: Desired file name. The characters ``\\ / : * ? " < > |``
            are removed because Windows does not allow them in file names.
        image_content: Raw bytes to write.

    A failure to create the directory or write the file is printed rather
    than raised.
    """
    # Built before the ``try`` so the error message can always refer to it.
    safe_name = re.sub(r'[\\/:*?"<>|]', '', image_name)
    hero_image_path = os.path.join(image_dir, safe_name)
    try:
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(image_dir, exist_ok=True)
        with open(hero_image_path, 'wb') as image:
            image.write(image_content)
    except (OSError, TypeError) as e:
        print('{}保存失敗,錯誤原因:{}'.format(hero_image_path, e))
4、爬蟲執行
if __name__ == '__main__':
    # Script entry point: build the spider and start from the hero list.
    spider = LOLHeroSpider()
    spider.hero_list()
5、完整代碼
import requests
import random
import time
import re
import json
import os
# url = https://lol.qq.com/data/info-heros.shtml
# url = 'https://lol.qq.com/data/info-heros.shtml'
# A User-Agent must be set so that requests look like they come from a browser.
user_agent_list = [
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
]
# One agent is picked at random, once, when the module is loaded.
USER_AGENT = random.choice(user_agent_list)
# Headers sent with every request made by the spider.
header = {'User-Agent': USER_AGENT}
class LOLHeroSpider():
    """Crawler that downloads every hero's skin images.

    The hero list is read from ``hero_list_url``, each hero's detail file
    from ``hero_url`` (formatted with the hero id), and the images are
    written below ``base_path`` in one directory per hero.
    """

    def __init__(self):
        self.header = header
        self.hero_list_url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
        self.hero_url = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'
        # NOTE(review): the directory name looks garbled (probably meant
        # '英雄聯盟'); left unchanged because editing it moves the output path.
        self.base_path = os.path.join('d:' + os.path.sep, '英雄聯(lián)盟')

    def send_get(self, url):
        """Send a GET request and return the response, or ``None`` on failure.

        Args:
            url: Address to request.

        Returns:
            The ``requests`` response when the server answers with HTTP 200.
            ``None`` when the request raises or any other status code comes
            back; the reason is printed in both cases.
        """
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, headers=self.header, timeout=10)
        except requests.RequestException as e:
            print(e)
            return None
        # Explicit check instead of ``assert``: assertions are stripped under
        # ``python -O``, which would let failed requests through silently.
        if response.status_code != 200:
            print('{}請求失敗'.format(url))
            return None
        return response

    def hero_list(self):
        """Fetch the hero list and pass it on to ``process_heroes``.

        A diagnostic is printed, and nothing is raised, when the list cannot
        be downloaded or when its body is not valid JSON.
        """
        response = self.send_get(self.hero_list_url)
        if not response:
            print('英雄列表為空,請檢查獲取URL:{}'.format(self.hero_list_url))
            return
        try:
            hero_list_dict = json.loads(response.text)
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass.
            print('英雄列表解析失敗,錯誤原因:{}'.format(e))
            return
        # The parsed top-level keys become keyword arguments.
        self.process_heroes(**hero_list_dict)

    def process_heroes(self, **hero_list_dict):
        """Download and process the detail data of every listed hero.

        Args:
            **hero_list_dict: Parsed hero-list JSON. Its ``'hero'`` entry is
                iterated; each item is read for ``'heroId'`` and ``'name'``.
        """
        for hero in hero_list_dict['hero']:
            hero_info_url = self.hero_url.format(hero['heroId'])
            resp = self.send_get(hero_info_url)
            if not resp:
                # Report the URL that actually failed, not the list URL.
                print('獲取英雄:{}失敗,請檢查獲取URL:{}'.format(hero['name'], hero_info_url))
                continue
            try:
                hero_info_dict = json.loads(resp.text)
            except ValueError as e:
                # One malformed hero file must not abort the remaining heroes.
                print('獲取英雄:{}失敗,錯誤原因:{}'.format(hero['name'], e))
                continue
            self.process_hero(**hero_info_dict)

    def process_hero(self, **hero_info_dict):
        """Download and save every skin image of a single hero.

        Args:
            **hero_info_dict: Parsed hero-detail JSON. ``'hero'`` supplies
                ``'name'`` and ``'title'``; ``'skins'`` is iterated and each
                item is read for ``'mainImg'`` and ``'name'``.
        """
        hero = hero_info_dict['hero']
        skins = hero_info_dict['skins']
        # Loop-invariant: every skin of one hero goes into the same directory.
        hero_image_dir = os.path.join(self.base_path, hero['name'] + hero['title'])
        for skin in skins:
            if not skin['mainImg']:
                continue
            skin_response = self.send_get(skin['mainImg'])
            if skin_response is None:
                # send_get returns None on failure (and has already printed
                # the reason); skip instead of raising AttributeError.
                continue
            hero_image_name = '{}.jpg'.format(skin['name'])
            self.save_image(hero_image_dir, hero_image_name, skin_response.content)
        print('hero:{},skins:{}張,處理完成'.format(hero['name'], len(skins)))
        # Pause one second between heroes (process_heroes calls this per hero).
        time.sleep(1)

    def save_image(self, image_dir, image_name, image_content):
        """Write image bytes into ``image_dir`` under a sanitised file name.

        Args:
            image_dir: Target directory; created when it does not exist.
            image_name: Desired file name. The characters
                ``\\ / : * ? " < > |`` are removed because Windows does not
                allow them in file names.
            image_content: Raw bytes to write.

        A failure to create the directory or write the file is printed
        rather than raised.
        """
        # Built before the ``try`` so the error message can always use it.
        safe_name = re.sub(r'[\\/:*?"<>|]', '', image_name)
        hero_image_path = os.path.join(image_dir, safe_name)
        try:
            # exist_ok avoids the race between an existence check and creation.
            os.makedirs(image_dir, exist_ok=True)
            with open(hero_image_path, 'wb') as image:
                image.write(image_content)
        except (OSError, TypeError) as e:
            print('{}保存失敗,錯誤原因:{}'.format(hero_image_path, e))
if __name__ == '__main__':
    # Script entry point: build the spider and start from the hero list.
    spider = LOLHeroSpider()
    spider.hero_list()