(Screenshot of the scraped content: 爬取內(nèi)容.png)
You know what this is about...
Heh heh heh~
The code first:
'''
Date: 2020
--- 大威鍋丨DaWeiGuo ---
'''
import requests
import re
from bs4 import BeautifulSoup
import os  # import the libraries we need

url = 'http://www.win4000.com/zt/xinggan.html'


# Fetch the gallery page and pass its HTML on so the link of each series can be extracted
def GetHtml(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        txt = r.text
        accessUrl(txt)  # hand the returned page HTML to accessUrl
    except Exception:
        print('Failed to access the page!')


# Extract the detail-page link of each photo series
def accessUrl(txt):
    soup = BeautifulSoup(txt, 'html.parser')
    tag_ul = soup.find_all('ul')
    url_list = re.findall(r'http://www.win4000.com/wallpaper_detail_\d+\.html', str(tag_ul))
    browse_url(url_list)  # pass the extracted links to browse_url as a list


# Loop over every series link
def browse_url(url_list):
    for every_list in url_list:
        solve_url(every_list)  # hand each link to solve_url


def solve_url(url_alone):
    url_alone_1 = url_alone.split('_')
    url_alone_last = url_alone_1[-1]
    url_alone_num = url_alone_last.split('.')[-2]  # pull the numeric ID out of the link so the paging URLs can be rebuilt below
    # print(url_alone_num)
    try:
        # Page through the series and save every page's image. Some series advance the
        # page number by 2 per page, others by 1, so 0-15 is used here; a few series
        # may therefore not be scraped completely.
        for i in range(0, 15):
            if i == 0:
                # The first page's URL lacks the trailing "_<number>" part, so handle it separately
                get_photo_url(url_alone)  # hand the first page's link to get_photo_url
            else:
                url_alone_compose = 'http://www.win4000.com/wallpaper_detail_' + url_alone_num + '_' + str(i) + '.html'
                # print(url_alone_compose)
                get_photo_url(url_alone_compose)  # hand the rebuilt link to get_photo_url
    except Exception:
        print('Image page does not exist')


# Extract the image URL from a detail page
def get_photo_url(url_photo):
    try:
        r = requests.get(url_photo)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        tag = soup.find(class_="pic-large")
        url_photo = tag['src']
        # print(url_photo)
        holdphoto(url_photo)  # hand the image URL to holdphoto to be saved
    except Exception:
        print('Failed to get the image URL')


# Save one image to disk
def holdphoto(url_photo):
    # root = "C:/Users/l1768/Desktop/福利圖/"  # folder the images were originally saved to
    root = os.path.join(os.path.abspath(os.path.dirname(__file__)), '福利圖')
    name = url_photo.split('/')[-1]  # take the file name from the image URL
    path = os.path.join(root, name)  # build the full path for this image
    try:
        if not os.path.exists(root):  # create the folder if it does not exist yet
            os.mkdir(root)
        if not os.path.exists(path):  # only download if the image is not already there
            r = requests.get(url_photo)
            with open(path, "wb") as f:
                f.write(r.content)
            print('---' + name + ' saved ---')
        else:
            print('--- file already exists ---')
    except Exception:
        print('--- failed to download the image ---')


if __name__ == "__main__":
    print('---- start scraping images ----')
    GetHtml(url)
    print('---- scraping finished ----')
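
One caveat in solve_url: the fixed range(0, 15) loop is there because some series advance the page number by 2 and others by 1, and as the comment says, a series can end up only partly scraped. A minimal alternative sketch, assuming a page that no longer contains a pic-large image (or that errors out) means the series has ended, is to keep requesting pages until a couple of consecutive misses occur. The helper name iter_series_pages, the max_pages cap, and the two-miss cut-off are my own illustrative assumptions, not part of the original script:

def iter_series_pages(url_alone, url_alone_num, max_pages=60):
    # Hypothetical replacement for the fixed range(0, 15) loop in solve_url:
    # walk the page numbers until two pages in a row yield no usable image.
    misses = 0
    for i in range(0, max_pages):
        if i == 0:
            page_url = url_alone  # the first page has no trailing "_<number>" part
        else:
            page_url = ('http://www.win4000.com/wallpaper_detail_'
                        + url_alone_num + '_' + str(i) + '.html')
        try:
            r = requests.get(page_url)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
            tag = soup.find(class_="pic-large")
            if tag is None or not tag.get('src'):
                raise ValueError('no pic-large image on this page')
            holdphoto(tag['src'])  # reuse the existing save function
            misses = 0
        except Exception:
            misses += 1
            if misses >= 2:  # two consecutive misses: assume the series has ended
                break

Whether two consecutive misses is the right cut-off depends on how the site actually numbers its pages, which I have not verified; the upside is that a long series is no longer cut off at page 15.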
(Screenshot of the run output: 運(yùn)行圖.png)