寫在前面
這么久了,代碼還是那么爛。。。。。。
環境:Windows10
編輯器:Pycharm
用到的庫:os requests bs4
URL = www.dbmeinv.com
開始爬取
第一步:獲取單頁圖片地址
網頁結構
圖片放在ul標簽下的img標簽下面,所以我們可以這樣寫代碼
def get_Imgs(url):
    """Collect the image addresses found on one listing page.

    Downloads ``url``, parses the HTML, and appends the ``src`` of every
    ``<img>`` inside the ``<ul class="thumbnails">`` element to the
    module-level ``all_url`` list.

    Args:
        url: Address of the page to scan.

    Returns:
        The shared ``all_url`` list on success, or the string ``"error"``
        when the request fails or the page has no thumbnails list.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raises on HTTP error status codes
    except requests.RequestException:
        return "error"

    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'lxml')

    # find() returns None when the element is absent; report that explicitly
    # instead of relying on a catch-all handler.
    thumbnails = soup.find('ul', {'class': 'thumbnails'})
    if thumbnails is None:
        return "error"

    # Use a distinct loop name so the ``url`` parameter is not shadowed.
    for img in thumbnails.find_all('img'):
        src = img.get('src')
        if src:
            all_url.append(src)
    return all_url
第二步:抓取多頁
URL結構
抓取多頁的話就可以改一下URL后邊的數字,代碼可以這樣寫
# Crawl several listing pages by varying the number at the end of the URL.
url = 'https://www.dbmeinv.com/?pager_offset='
for i in range(10):
    # get_Imgs signals a failed page by returning "error"; report it here
    # (a bare ``return`` is not valid at module level).
    if get_Imgs(url + str(i)) == "error":
        print('failed to fetch page ' + str(i))
第三步:保存圖片
def save_imgs():
    """Download every address in ``all_url`` into the ``pic`` directory.

    Each file is named ``<index>.jpg`` after the address's position in
    ``all_url``. A failed image is skipped so the remaining ones are still
    saved.

    Returns:
        ``None`` when every image was saved, or the string ``"error"`` when
        at least one download or write failed.
    """
    dir_name = 'pic'
    # exist_ok avoids the check-then-create race. Paths are joined rather
    # than calling os.chdir, so the working directory is left untouched and
    # a repeated call does not create pic/pic.
    os.makedirs(dir_name, exist_ok=True)

    failed = False
    for i, url in enumerate(all_url):
        try:
            response = requests.get(url, timeout=10)
            # Do not store an HTTP error page under a .jpg name.
            response.raise_for_status()
            # Download before opening so a failure leaves no empty file.
            with open(os.path.join(dir_name, str(i) + '.jpg'), 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError):
            failed = True
    return "error" if failed else None
放上完整的代碼
import os
import requests
from bs4 import BeautifulSoup
# Shared accumulator: get_Imgs appends image addresses here, save_imgs reads them.
all_url = []
def get_Imgs(url):
    """Collect the image addresses found on one listing page.

    Downloads ``url``, parses the HTML, and appends the ``src`` of every
    ``<img>`` inside the ``<ul class="thumbnails">`` element to the
    module-level ``all_url`` list.

    Args:
        url: Address of the page to scan.

    Returns:
        The shared ``all_url`` list on success, or the string ``"error"``
        when the request fails or the page has no thumbnails list.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raises on HTTP error status codes
    except requests.RequestException:
        return "error"

    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'lxml')

    # find() returns None when the element is absent; report that explicitly
    # instead of relying on a catch-all handler.
    thumbnails = soup.find('ul', {'class': 'thumbnails'})
    if thumbnails is None:
        return "error"

    # Use a distinct loop name so the ``url`` parameter is not shadowed.
    for img in thumbnails.find_all('img'):
        src = img.get('src')
        if src:
            all_url.append(src)
    return all_url
def save_imgs():
    """Download every address in ``all_url`` into the ``pic`` directory.

    Each file is named ``<index>.jpg`` after the address's position in
    ``all_url``. A failed image is skipped so the remaining ones are still
    saved.

    Returns:
        ``None`` when every image was saved, or the string ``"error"`` when
        at least one download or write failed.
    """
    dir_name = 'pic'
    # exist_ok avoids the check-then-create race. Paths are joined rather
    # than calling os.chdir, so the working directory is left untouched and
    # a repeated call does not create pic/pic.
    os.makedirs(dir_name, exist_ok=True)

    failed = False
    for i, url in enumerate(all_url):
        try:
            response = requests.get(url, timeout=10)
            # Do not store an HTTP error page under a .jpg name.
            response.raise_for_status()
            # Download before opening so a failure leaves no empty file.
            with open(os.path.join(dir_name, str(i) + '.jpg'), 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError):
            failed = True
    return "error" if failed else None
def main(pages=200):
    """Crawl the listing pages, then download every image that was found.

    Args:
        pages: Number of listing pages to request; offsets ``0`` to
            ``pages - 1`` are appended to the base URL.
    """
    url = 'https://www.dbmeinv.com/?pager_offset='
    for i in range(pages):
        # get_Imgs signals a failed page by returning "error"; report it
        # rather than dropping the result silently, then keep crawling.
        if get_Imgs(url + str(i)) == "error":
            print('failed to fetch page ' + str(i))
    save_imgs()
# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()
嘿嘿嘿