image.png
偽裝成瀏覽器,請求頁面,并下載網頁
import urllib.request
# Page to download.
URL = "https://www.hao123.com/manhua/detail/176"
# Browser-like User-Agent so the request does not identify itself with
# urllib's default "Python-urllib/x.y" agent string.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/51.0.2704.63 Safari/537.36'
}
opener = urllib.request.build_opener()
# The opener sends the (name, value) tuples stored in ``addheaders`` with
# every request. The previous code assigned to ``add_handler`` instead, which
# replaced that method with a list and never sent the custom User-Agent.
opener.addheaders = list(header.items())
# Close the HTTP response as soon as the body has been read.
with opener.open(URL) as response:
    data = response.read()
# Save the raw response bytes; ``with`` closes the file even if the write fails.
with open("/Users/vincentwen/Downloads/file.html", "wb") as Hfile:
    Hfile.write(data)
image.png
爬取漫畫網站的首頁
import re
import urllib.request
import urllib.error
# Fetch the home page of the site to crawl; ``with`` closes the response.
with urllib.request.urlopen("http://www.pufei.net/") as response:
    Readdata = response.read()
# Decode the raw bytes as UTF-8, ignoring any bytes that fail to decode.
data = Readdata.decode("utf-8", "ignore")
# Regular expression matching URLs under the site's "manhua" directory.
# NOTE(review): the original pattern was truncated in this file (only the
# opening quote survived), so this is a reconstruction based on the
# surrounding comments -- confirm it against the page's actual markup.
pat = r'href="(https?://www\.pufei\.net/manhua/[^"]+)"'
# Collect every URL in the page that matches the pattern.
allurl = re.compile(pat).findall(data)
for i, thisurl in enumerate(allurl):
    try:
        print("第" + str(i) + "抓取")
        # Each page is saved under its index in the match list.
        file = "/Users/vincentwen/Downloads/" + str(i) + ".html"
        urllib.request.urlretrieve(thisurl, file)
        print("------抓取成功-----")
    except urllib.error.URLError as e:
        # Best effort: report the failure and continue with the next URL.
        # Print whichever of ``code`` / ``reason`` the error object provides.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
image.png