初學爬蟲入坑爬煎蛋的教程,無奈煎蛋有反爬蟲機制獲取不到真實圖片地址,研究了兩天,自己寫了一個,代碼很簡單,便于理解。
import time,requests
from selenium import webdriver
from bs4 import BeautifulSoup
# Download every <img> found on jandan.net/ooxx pages 48 down to 1.
# Selenium drives a real Chrome window to load each page; the rendered HTML is
# then parsed with BeautifulSoup and each image is fetched with requests.
star = time.time()
browser = webdriver.Chrome()
# Destination directory; it must already exist, otherwise every write fails
# with OSError and is counted as a failed download.
folder_path = 'C:\\Users\\Administrator\\Desktop\\JD\\'
n = 1      # 1-based ordinal of the image currently being processed
total = 0  # number of images saved successfully
try:
    for num in range(48, 0, -1):
        browser.get('http://jandan.net/ooxx/page-' + str(num) + '#comments')
        soup = BeautifulSoup(browser.page_source, 'lxml')
        # <img> tags without a src attribute return None and are skipped.
        download_links = [
            pic_tag.get('src')
            for pic_tag in soup.find_all('img')
            if pic_tag.get('src')
        ]
        for item in download_links:
            # Protocol-relative links ("//host/path") carry no scheme, which
            # requests rejects, so give them one explicitly.
            url = 'http:' + item if item.startswith('//') else item
            try:
                # Fetch first so a failed request never leaves an empty file.
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                # File name is the last 10 characters of the src attribute.
                with open(folder_path + item[-10:], 'wb') as f:
                    f.write(response.content)
            except (requests.RequestException, OSError):
                print('第{}張圖片下載出錯,已跳過'.format(n))
            else:
                print('正在下載第{}圖片'.format(n))
                total += 1
            n += 1
finally:
    # quit() ends the whole driver session, even if the loop above raised.
    browser.quit()
end = time.time()
print('總共用時{}分'.format((end - star) / 60))
# n started at 1, so n - 1 images were attempted in total.
print('成功下載{}張圖片,失敗{}張圖片'.format(total, n - 1 - total))