Web-scraping lesson one for the homebody crowd! Here comes a little treat for you~
Without further ado, straight to the code!
```python
# -*- coding: utf-8 -*-
# FUNCTION: capture gallery pictures
import os
import time

import requests
from bs4 import BeautifulSoup

# galleries to download
url_list = ['http://www.mzitu.com/201024', 'http://www.mzitu.com/169782']

headers = {
    'referer': 'https://www.mzitu.com/201024',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 '
                  'Safari/537.36'
}


def get_page_num(url):
    """Return the gallery's page count and name, both read from its first page."""
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    page_num = soup.find(class_='pagenavi').find_all('a')[-2].text
    name = soup.find(class_='currentpath').text.split()[-1]
    return page_num, name  # page_num is a string


def parse_page(url):
    """
    Get the image on one page.
    :param url: page URL
    :return: image URL, image name
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    pic_url = soup.find(class_='main-image').find('img')['src']
    pic_name = soup.find(class_='main-title').text
    return pic_url, pic_name


def get_pic(pic_url, pic_name, name):
    """Download and save one image."""
    response = requests.get(pic_url, headers=headers, allow_redirects=False)
    filepath = '/home/f/crawler/Beauty/photo/' + name + '/' + pic_name + '.jpg'
    with open(filepath, 'wb') as f:
        f.write(response.content)


def main():
    for url in url_list:
        page_num, name = get_page_num(url)
        try:
            os.mkdir('/home/f/crawler/Beauty/photo/' + name)
        except FileExistsError:
            pass
        for page in range(1, int(page_num) + 1):  # iterate over pages 1..page_num
            page_url = url + '/' + str(page)
            print(page_url)
            pic_url, pic_name = parse_page(page_url)
            get_pic(pic_url, pic_name, name)
        time.sleep(2)  # pause briefly between galleries


if __name__ == '__main__':
    main()
```
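Two things to watch if you adapt this. The `referer` in `headers` is pinned to the first gallery; hosts with hotlink protection usually check that the Referer matches the page the image appears on, which is presumably why the header is there at all, so sending the current page's URL is safer. The save path under `/home/f/crawler/Beauty/photo/` is also specific to the author's machine, so change it to suit yours. Below is a minimal, hedged sketch of a more defensive download step; `get_pic_safe`, `save_dir`, and `page_url` are names introduced here for illustration and are not part of the original script:

```python
# Hedged sketch of a more defensive download step (get_pic_safe, save_dir and
# page_url are illustrative names, not part of the original script).
import os
import requests


def get_pic_safe(pic_url, pic_name, save_dir, page_url, headers):
    """Download one image, using its own gallery page as the Referer."""
    per_request_headers = dict(headers)
    per_request_headers['referer'] = page_url  # hotlink checks usually expect the real page
    try:
        response = requests.get(pic_url, headers=per_request_headers,
                                allow_redirects=False, timeout=10)
    except requests.RequestException as e:
        print('request failed for ' + pic_url + ': ' + str(e))
        return
    if response.status_code != 200:
        # a redirect or error status usually means the hotlink check rejected us
        print('skip ' + pic_url + ': HTTP ' + str(response.status_code))
        return
    safe_name = pic_name.replace('/', '_').strip()  # avoid illegal filename characters
    with open(os.path.join(save_dir, safe_name + '.jpg'), 'wb') as f:
        f.write(response.content)
```

To wire it in, you would call something like `get_pic_safe(pic_url, pic_name, '/home/f/crawler/Beauty/photo/' + name, page_url, headers)` where `main()` currently calls `get_pic(pic_url, pic_name, name)`. And if you scrape more than a couple of galleries, moving the `time.sleep(2)` inside the inner page loop spaces the requests out and is gentler on the site.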
Feel free to bookmark this and work through it at your own pace!
————————————————————————————————————————————
Follow the WeChat account: **python爬蟲機(jī)器學(xué)習(xí)深度學(xué)習(xí)**