# Scrape meizitu.com (妹子圖) with BeautifulSoup + Requests
import requests
import re, time, os
from bs4 import BeautifulSoup
# Listing pages 1..30 to crawl (the original author noted the site had 92 pages at the time).
urls = list(map("http://www.meizitu.com/a/list_1_{}.html".format, range(1, 31)))
# Request headers sent to www.meizitu.com (listing and gallery pages).
# NOTE(review): the Cookie, If-Modified-Since and If-None-Match values look
# like they were captured from one browser session; confirm they are still
# wanted, since conditional headers can make a server answer 304 with no body.
headers1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'bdshare_firstime=1477909035413; Hm_lvt_a01ff1f91d0b936673f038453940cdb9=1477909035,1477909103; safedog-flow-item=F05CF6535242D231B430A78792F9D78D; CNZZDATA30056528=cnzz_eid%3D318212343-1488381613-null%26ntime%3D1488449174',
    'Host': 'www.meizitu.com',
    'If-Modified-Since': 'Tue, 21 Feb 2017 15:45:20 GMT',
    'If-None-Match': "6470d82598cd21:196c",
    'Referer': 'http://www.meizitu.com/a/list_1_1.html',
    # Fixed: this key was misspelled 'pgrade-Insecure-Requests'.
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
# Request headers for the image host. Per the original author's note, the
# image download links live on a different site that has anti-scraping
# measures, hence this separate header set.
headers2 = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;'
        'q=0.9,image/webp,*/*;q=0.8'
    ),
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': (
        '__jsluid=a0a09999bc1cd95bb78e3cfc51c0b9d8; '
        'safedog-flow-item=2676F109CF0E6A11F1AB1ADC63D76F97'
    ),
    'Host': 'mm.howkuai.com',
    'If-Modified-Since': 'Sat, 19 Nov 2016 20:12:20 GMT',
    'If-None-Match': "16808f3ba142d21:1527",
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.87 Safari/537.36'
    ),
}
def _clean_title(raw_title):
    """Return ``raw_title`` without literal ``<b>``/``</b>`` markers or outer whitespace."""
    # str.strip('<b>') removes any of the characters '<', 'b', '>' from both
    # ends, which also eats real title characters; remove the tags explicitly.
    return re.sub(r'</?b>', '', raw_title).strip()


def Get_url(url, timeout=30):
    """Fetch one listing page and collect its gallery links and titles.

    Args:
        url: Address of a listing page.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so a stalled connection cannot hang the crawl.

    Returns:
        A ``(links, titles)`` tuple of two equally long lists: ``links[i]``
        is the gallery page URL and ``titles[i]`` the cleaned title of the
        same listing entry.
    """
    links = []
    titles = []
    web_data = requests.get(url, headers=headers1, timeout=timeout)
    web_data.encoding = 'gb2312'  # switch to the encoding used by the site
    soup = BeautifulSoup(web_data.text, 'lxml')
    # Each gallery on the listing page lives in a <div class="pic">.
    for entry in soup.find_all("div", class_="pic"):
        anchor = entry.find(target="_blank")
        image = entry.find("img")
        if anchor is None or image is None:
            continue  # malformed entry: skip it instead of raising IndexError
        href = anchor.get("href")
        alt = image.get("alt")
        if not href or alt is None:
            continue
        # Append both together so the two lists stay index-aligned.
        links.append(href)
        titles.append(_clean_title(alt))
    return links, titles
def Get_picture_link(website, filename, timeout=30):
    """Download every image of one gallery page into a folder named ``filename``.

    Args:
        website: URL of the gallery page that embeds the images.
        filename: Gallery title; used as the name of the target directory,
            which is created under the current working directory.
        timeout: Seconds to wait on each HTTP request before ``requests``
            raises a timeout error.

    Returns:
        None. Each image that downloads with status 200 is written to
        ``<cwd>/<filename>/<alt text>.jpg``.
    """
    # exist_ok allows re-running an interrupted crawl without crashing on a
    # folder that was already created.
    os.makedirs(filename, exist_ok=True)
    time.sleep(4)  # throttle requests to avoid getting the IP banned
    web_data = requests.get(website, headers=headers1, timeout=timeout)
    web_data.encoding = 'gb2312'
    soup = BeautifulSoup(web_data.text, 'lxml')
    # The images sit inside the single element with id="picture"; tolerate
    # pages where it is absent instead of raising IndexError.
    container = soup.find(id="picture")
    images = container.find_all("img") if container is not None else []
    # os.path.join keeps the path valid on every OS (the original hard-coded '\\').
    target_dir = os.path.join(os.getcwd(), filename)
    for index, image in enumerate(images):
        source = image.get("src")
        if not source:
            continue
        # Fall back to a positional name when the image carries no alt text.
        name = image.get("alt") or "{}_{}".format(filename, index)
        time.sleep(2)
        picture = requests.get(source, headers=headers2, timeout=timeout)
        if picture.status_code != 200:
            continue
        path = os.path.join(target_dir, name + '.jpg')
        # The context manager guarantees the file handle is closed.
        with open(path, 'wb') as image_file:
            image_file.write(picture.content)
    print('完成了一個(gè)文件夾')
if __name__ == '__main__':
    # Pipeline: listing page URL -> (extract) -> gallery page URLs
    #           -> (extract) -> image links.
    for listing_url in urls:
        gallery_links, gallery_titles = Get_url(listing_url)
        for position, gallery_link in enumerate(gallery_links):
            Get_picture_link(gallery_link, gallery_titles[position])