1 基本思路
- 主頁面分析,通過xpath獲得需要圖片的子頁面url
- 子頁面分析,通過re提取每張圖片的下載路徑
- 下載圖片
2 代碼
import json
import re
from urllib.parse import urljoin

import requests
from lxml import etree
# Main page analysis: fetch the landing page and collect the absolute URL of
# every detail page linked from the <ul id="newPicList"> list.
resp = requests.get("https://desk.zol.com.cn/", timeout=10)
# Fail early on an HTTP error instead of parsing an error page.
resp.raise_for_status()
# The response is decoded as GBK explicitly rather than relying on detection.
resp.encoding = "gbk"
et = etree.HTML(resp.text)
# urljoin yields a well-formed URL whether the href is root-relative
# ("/bizhi/...") or relative ("bizhi/..."), avoiding a doubled slash.
urls = [
    urljoin("https://desk.zol.com.cn/", item)
    for item in et.xpath("//ul[@id='newPicList']/li/a/@href")
]
# Sub-page analysis: every detail page embeds a JavaScript assignment of the
# form `var deskPicArr = {...};`. The captured payload is parsed as JSON and
# each entry of its "list" contributes a size string ("oriSize") and an image
# URL template ("imgsrc") containing the placeholder ##SIZE##.
obj = re.compile(r'var deskPicArr.*?=(?P<deskpicarr>.*?);', re.S)
sizes = []
srcs = []
for i, url in enumerate(urls):
    try:
        tmp = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # One unreachable page should not abort the whole crawl.
        print("skip", url, exc)
        continue
    print("第", i, "面已經爬取")
    tmp.encoding = "utf-8"
    match = obj.search(tmp.text)
    if match is None:
        # Page layout differs from what the pattern expects; nothing to parse.
        print("no deskPicArr found in", url)
        continue
    deskPic = json.loads(match.group("deskpicarr"))
    for j in deskPic.get("list") or []:
        size = j.get("oriSize")
        src = j.get("imgsrc")
        print(size, src)
        # Skip incomplete entries so that sizes and srcs stay aligned and the
        # placeholder substitution below never receives None.
        if not size or not src:
            continue
        sizes.append(size)
        srcs.append(src)
# Substitute the literal ##SIZE## placeholder with the original size to get
# the final download URL of each image.
imgloads = [
    template.replace("##SIZE##", dim) for dim, template in zip(sizes, srcs)
]
# Download images: save each resolved URL as img<N>.jpg in the current
# working directory, where N is the position of the URL in imgloads.
for i, img_url in enumerate(imgloads):
    print("img{}.jpg".format(i), "is loading!")
    try:
        resp_img = requests.get(img_url, timeout=10)
        # Avoid writing an HTTP error page to disk as if it were an image.
        resp_img.raise_for_status()
    except requests.RequestException as exc:
        print("img{}.jpg".format(i), "failed:", exc)
        continue
    with open("img{}.jpg".format(i), mode="wb") as f:
        f.write(resp_img.content)
print("loading is over")