# The code follows.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#@Author : BigBro
#@DateTime : 2015-09-25 16:14:29
#@Filename : crawler_tieba.py
#@Description : Web crawler, Baidu Tieba, One Piece
import urllib.request
import os,re,socket
from bs4 import BeautifulSoup
# Finds the first "http...jpg" address inside the text of a rendered <img> tag.
JPG_URL_PATTERN = re.compile(r'http.*?jpg')
# Characters that cannot appear in a Windows file or directory name.
INVALID_DIR_CHARS = re.compile(r'[\\/:*?"<>|]')
# Seconds to wait for a single image before giving up on it.
DOWNLOAD_TIMEOUT = 5


def extract_jpg_urls(tags):
    """Return the first JPG address found in each tag, in tag order.

    Each tag is converted with ``str()`` and searched with
    ``JPG_URL_PATTERN``; tags without a match are dropped.
    """
    matches = (JPG_URL_PATTERN.search(str(tag)) for tag in tags)
    return [match.group() for match in matches if match]


def safe_dir_name(title, fallback='tieba_images'):
    """Turn a page title into a name usable as a directory.

    Characters that Windows rejects in file names are replaced by ``_``.
    ``fallback`` is returned when the title is missing or empty.
    """
    if title is None:
        return fallback
    cleaned = INVALID_DIR_CHARS.sub('_', str(title))
    return cleaned or fallback


def fetch_image(image_url, timeout=DOWNLOAD_TIMEOUT):
    """Download ``image_url`` and return its raw bytes.

    The response is closed before returning. Network failures propagate as
    ``OSError`` subclasses (``socket.timeout``, ``urllib.error.URLError``).
    """
    with urllib.request.urlopen(image_url, timeout=timeout) as resp:
        return resp.read()


def download_images(urls, target_dir, fetch=fetch_image):
    """Save every address in ``urls`` as ``<index>.jpg`` inside ``target_dir``.

    The file name is the position of the address in ``urls``, so a failed
    download never shifts the numbering and a later run can retry exactly the
    missing images. Existing non-empty files are left untouched.

    ``fetch`` is the callable used to obtain the bytes of one address; it is
    expected to raise ``OSError`` on network failure.

    Returns the number of files written by this call.
    """
    saved = 0
    for index, image_url in enumerate(urls):
        filename = '%d.jpg' % index
        target = os.path.join(target_dir, filename)
        # A zero-byte file is the leftover of an earlier failed download,
        # so it is treated as missing and fetched again.
        if os.path.exists(target) and os.path.getsize(target) > 0:
            print('%d exists,then continue' % (index))
            continue
        print(filename)
        try:
            data = fetch(image_url)
        except OSError as err:
            # urlopen wraps connection-phase errors (including timeouts) in
            # URLError, which is an OSError just like socket.timeout.
            print('%s failed: %s' % (filename, err))
            continue
        # Only create the file once the data is in hand, so a failure never
        # leaves an empty placeholder behind.
        with open(target, mode='wb') as out:
            out.write(data)
        saved += 1
    return saved


def main():
    """Ask for a Tieba thread address and download its images.

    Images are stored in a sub-directory of the current working directory
    named after the page title.
    """
    page_url = input('輸入網(wǎng)址:')  # paste the address of the Tieba thread
    with urllib.request.urlopen(page_url) as page:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(page, 'html.parser')

    tags = soup.select('img[class="BDE_Image"]')  # class Baidu puts on post images
    jpg_urls = extract_jpg_urls(tags)

    title = soup.title.string if soup.title is not None else None
    target_dir = os.path.join(os.getcwd(), safe_dir_name(title))
    os.makedirs(target_dir, exist_ok=True)

    # The page order of the images is assumed to be the desired order.
    download_images(jpg_urls, target_dir)
    print('Downloading is done')


if __name__ == '__main__':
    main()