嘿嘿 召喚老色批
今天帶大家爬取一下美女的圖片
用的是requests和xpath去解析
獲取網頁和解析網頁的函數
def get_tag(response, tag):
    """Parse an HTML string and return the list of nodes matched by an XPath expression."""
    tree = etree.HTML(response)
    return tree.xpath(tag)
def parse_url(url):
    """GET *url* with the module-level headers and return the response body as text."""
    return requests.get(url, headers=headers).text
獲取網頁url
def url_find(url):
    """Scrape one listing page: extract each album's link and title, then download it.

    Replaces the original ``for i in range(len(...))`` parallel-list indexing
    with ``zip``, which also avoids an IndexError if the two XPath result
    lists ever differ in length.
    """
    page_html = parse_url(url)
    album_urls = get_tag(page_html, '//*[@id="pins"]/li/span[1]/a/@href')
    titles = get_tag(page_html, '//*[@id="pins"]/li/span[1]/a/text()')
    for album_url, album_title in zip(album_urls, titles):
        url_jpg_find(album_url, album_title)
    # NOTE(review): prints the whole title list once per listing page as a
    # crude progress marker — loop placement inferred; original indentation lost.
    print(titles, '保存完畢')
獲取圖片的url
def url_jpg_find(url, title):
    """Download every image page of one album into a directory named *title*.

    Resets the module-level ``page`` counter so image numbering restarts per album.
    """
    global page
    page = 0
    album_html = parse_url(url)
    # The 5th pagination anchor holds the last page number — fragile if the
    # site layout changes; TODO confirm against the live markup.
    last_page = int(get_tag(album_html, '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
    page_urls = [url] + [url + '/' + str(i) for i in range(2, last_page + 1)]
    # exist_ok avoids the racy exists()/makedirs() pair in the original.
    os.makedirs(title, exist_ok=True)
    for page_url in page_urls:
        content_find(page_url, title)
獲取圖片的信息
def content_find(url, title):
    """Open one image page, extract the image name and URL, and save the file."""
    page_html = parse_url(url)
    image_name = get_tag(page_html, '/html/body/div[2]/div[1]/h2/text()')[0]
    image_url = get_tag(page_html, '//div[@class="main-image"]//a/img/@src')[0]
    # Small delay between requests to avoid hammering the server.
    time.sleep(0.2)
    save(image_name, image_url, title)
保存圖片
def save(name, url_jpg, title):
    """Download one image and write it to <cwd>/<title>/<name>.jpg.

    Increments the module-level ``page`` counter and prints it as progress.
    """
    global page
    response = requests.get(url_jpg, headers=headers)
    # os.path.join replaces manual '/' concatenation; the explicit close()
    # in the original was redundant inside the with-block and is dropped.
    target = os.path.join(os.getcwd(), title, name + '.jpg')
    with open(target, 'wb') as fh:
        fh.write(response.content)
    page += 1
    print(page)
# Third-party and stdlib imports used by the whole script.
import requests,os,time
from lxml import etree
# Shared request headers. NOTE(review): the Referer header is presumably
# required by the site's anti-hotlinking check — confirm against the site.
headers={
"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
"Referer" : "https://www.mzitu.com",
}
# Global per-album image counter: reset in url_jpg_find, bumped in save.
page=0
def get_tag(response, tag):
    """Parse an HTML string and return the list of nodes matched by an XPath expression."""
    tree = etree.HTML(response)
    return tree.xpath(tag)
def parse_url(url):
    """GET *url* with the module-level headers and return the response body as text."""
    return requests.get(url, headers=headers).text
def url_find(url):
    """Scrape one listing page: extract each album's link and title, then download it.

    Replaces the original ``for i in range(len(...))`` parallel-list indexing
    with ``zip``, which also avoids an IndexError if the two XPath result
    lists ever differ in length.
    """
    page_html = parse_url(url)
    album_urls = get_tag(page_html, '//*[@id="pins"]/li/span[1]/a/@href')
    titles = get_tag(page_html, '//*[@id="pins"]/li/span[1]/a/text()')
    for album_url, album_title in zip(album_urls, titles):
        url_jpg_find(album_url, album_title)
    # NOTE(review): prints the whole title list once per listing page as a
    # crude progress marker — loop placement inferred; original indentation lost.
    print(titles, '保存完畢')
def url_jpg_find(url, title):
    """Download every image page of one album into a directory named *title*.

    Resets the module-level ``page`` counter so image numbering restarts per album.
    """
    global page
    page = 0
    album_html = parse_url(url)
    # The 5th pagination anchor holds the last page number — fragile if the
    # site layout changes; TODO confirm against the live markup.
    last_page = int(get_tag(album_html, '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
    page_urls = [url] + [url + '/' + str(i) for i in range(2, last_page + 1)]
    # exist_ok avoids the racy exists()/makedirs() pair in the original.
    os.makedirs(title, exist_ok=True)
    for page_url in page_urls:
        content_find(page_url, title)
def content_find(url, title):
    """Open one image page, extract the image name and URL, and save the file."""
    page_html = parse_url(url)
    image_name = get_tag(page_html, '/html/body/div[2]/div[1]/h2/text()')[0]
    image_url = get_tag(page_html, '//div[@class="main-image"]//a/img/@src')[0]
    # Small delay between requests to avoid hammering the server.
    time.sleep(0.2)
    save(image_name, image_url, title)
def save(name, url_jpg, title):
    """Download one image and write it to <cwd>/<title>/<name>.jpg.

    Increments the module-level ``page`` counter and prints it as progress.
    """
    global page
    response = requests.get(url_jpg, headers=headers)
    # os.path.join replaces manual '/' concatenation; the explicit close()
    # in the original was redundant inside the with-block and is dropped.
    target = os.path.join(os.getcwd(), title, name + '.jpg')
    with open(target, 'wb') as fh:
        fh.write(response.content)
    page += 1
    print(page)
def main():
    """Discover how many listing pages the site has, then crawl each one."""
    start_url = 'https://www.mzitu.com'
    home_html = parse_url(start_url)
    # Last listing-page number comes from the 4th pagination anchor.
    last_page = int(get_tag(home_html, '/html/body/div[2]/div[1]/div[3]/div/a[4]/text()')[0])
    listing_urls = ['https://www.mzitu.com']
    for i in range(2, last_page + 1):
        listing_urls.append('https://www.mzitu.com/page/' + str(i))
    for listing_url in listing_urls:
        url_find(listing_url)
# Script entry point: start the crawl only when run directly, not on import.
if __name__ == '__main__':
    main()
效果圖就不放了
咳咳 太誘人 會被封掉
請大家自行腦補一下
一起學習python，小白指導，教學分享記得私信我