urllib3與早前版本稍有差異,記錄下幾個調整的版本(目前只有版本一,關於bug的改進版本後續再貼,有熱心的小夥伴也可以提提建議~)
版本一
剛入門,在網上搜刮了一些代碼,又調整了一下,可以將百度圖片根據某關鍵字搜到的結果下載到本地,但版本一這段代碼會出現如下錯誤:
- 下載到損壞的圖片,無法打開
- 程序卡在某個(gè)圖片的URL
- 重復(fù)url下載
import os
import re

import urllib3
def getHtml(url):
    """Fetch ``url`` with an HTTP GET and return the body as text.

    The decoded page is also echoed to stdout, exactly as the original
    script did.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8. Bytes that are not valid UTF-8
        are replaced with U+FFFD instead of raising ``UnicodeDecodeError``.

    Raises:
        urllib3.exceptions.HTTPError: If the request cannot be completed
            (connection failure, timeout, retries exhausted).
    """
    http = urllib3.PoolManager()
    # Without an explicit timeout urllib3 waits indefinitely on a stalled
    # server, which makes the whole script hang.
    r = http.request('GET', url, timeout=10.0)
    # A single bad byte in the page must not abort the run.
    htmlStr = r.data.decode('utf-8', errors='replace')
    print(htmlStr)
    return htmlStr
def getImg(htmlStr, saveDir='/home/yijie/PycharmProjects/osm/gouwuzhongxin'):
    """Extract image URLs from ``htmlStr`` and hand them to ``saveImage``.

    A URL is picked up when it appears as ``:"http://...<ext>"`` with
    ``<ext>`` being ``jpg``, ``png`` or ``gif``. Each distinct URL is kept
    once, in order of first appearance, so the same image is not downloaded
    repeatedly.

    Args:
        htmlStr: Page source to scan.
        saveDir: Directory passed to ``saveImage`` as the download target.
            Defaults to the path the original script hard-coded.
    """
    reg = r':"(http://[^"]+\.(?:jpg|png|gif))"?'
    imgre = re.compile(reg)

    # The same URL can occur several times in the page source; drop the
    # repeats while preserving the order in which URLs were found.
    seen = set()
    imglist = []
    for imageURL in imgre.findall(htmlStr):
        if imageURL in seen:
            continue
        seen.add(imageURL)
        imglist.append(imageURL)

    saveImage(imglist, saveDir)
def saveImage(imglist,name):
    """Download every URL in ``imglist`` into the directory ``name``.

    Files are written as ``<name>/<n>.<ext>``, where ``n`` counts the
    successfully saved images starting at 1 and ``ext`` is the text after
    the last dot of the URL. A URL that cannot be fetched, or whose response
    does not look like image data, is reported and skipped instead of being
    written to disk as a broken file.

    Args:
        imglist: Iterable of image URL strings.
        name: Directory the images are stored in; created if it is missing.
    """
    os.makedirs(name, exist_ok=True)
    http = urllib3.PoolManager()
    saved = 0
    for imageURL in imglist:
        print(imageURL)

        # Download one image. The timeout keeps a single unresponsive host
        # from stalling the whole run.
        try:
            r = http.request('GET', imageURL, timeout=10.0)
        except urllib3.exceptions.HTTPError as err:
            print('skip %s: %s' % (imageURL, err))
            continue

        data = r.data
        contentType = r.headers.get('Content-Type', '')
        # Error pages (non-200 status, empty body, or a text/* payload)
        # would otherwise be stored under an image extension and fail to
        # open later.
        if r.status != 200 or not data or contentType.startswith('text/'):
            print('skip %s: status=%s content-type=%s'
                  % (imageURL, r.status, contentType))
            continue

        fileExt = imageURL.rsplit('.', 1)[-1]
        # ``name`` is the storage directory; the running count is the file name.
        fileName = os.path.join(name, '%d.%s' % (saved + 1, fileExt))
        # ``with`` closes the file even if the write fails.
        with open(fileName, 'wb') as f:
            f.write(data)
        saved += 1
        print(u'正在保存的一張圖片為:%s' % fileName)

    print('\ntotal number of image:%d, saved in:%s' % (saved, name))
if __name__ == '__main__':
    # Baidu image-search results page; the ``word`` query parameter carries
    # the percent-encoded search keyword.
    searchUrl = r'http://image.baidu.com/search/index?ct=201326592&z=0&s=0&tn=baiduimage&ipn=r&word=%E8%B4%AD%E7%89%A9%E4%B8%AD%E5%BF%83%E5%B9%B3%E9%9D%A2%E5%9B%BE&pn=0&istype=2&ie=utf-8&oe=utf-8&cl=2&lm=7&st=-1&fr=&fmq=1508290519080_R&ic=0&se=&sme=&width=0&height=0&face=0'
    # Fetch the page, then extract and download the images it references.
    getImg(getHtml(searchUrl))