urllib提供了一系列用于操作URL的功能。
from urllib import request  # bring urllib's request submodule into scope

# Open the page. urlopen accepts a URL string directly, or a Request object
# that was prepared beforehand. The context manager closes the HTTP response
# once the body has been consumed.
with request.urlopen("http://www.baidu.com") as resp:
    # Read the raw response bytes and decode them as UTF-8 text.
    print(resp.read().decode("UTF-8"))
from urllib import request

# Build the Request first so that headers can be attached before it is sent.
req = request.Request("http://www.baidu.com")
# Present a browser-like User-Agent so the site is less likely to identify
# this client as a crawler.
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
# The context manager closes the HTTP response after the body is read.
with request.urlopen(req) as resp:
    print(resp.read().decode("UTF-8"))
以 http://www.thsrc.com.tw/tw/TimeTable/SearchResult 網站為例,Network 中 Doc 的 SearchResult 的 Origin 和 User-Agent 是我們需要的,為了不讓網站認出你是爬蟲。
# Import styles and how they change the call sites:
#   from urllib import request          -> call request.Request(...) and request.urlopen(...)
#   from urllib.request import urlopen  -> call urlopen(...) directly, not request.urlopen(...)
#   from urllib.request import Request  -> likewise, call Request(...) directly, not request.Request(...)
from urllib import request
# Alternative imports (not used below):
# from urllib.request import urlopen
# from urllib.request import Request
from urllib import parse

req = request.Request("http://www.thsrc.com.tw/tw/TimeTable/SearchResult")
# A Request object supports add_header(...) and also carries a data attribute.

# Form fields of the timetable search, URL-encoded into a single string.
postData = parse.urlencode([
    ("StartStation", "2f940836-cedc-41ef-8e28-c2336ac8fe68"),
    ("EndStation", "977abb69-413a-4ccf-a109-0272c24fd490"),
    ("SearchDate", "2018/04/11"),
    ("SearchTime", "19:00"),
    ("SearchWay", "DepartureInMandarin")
])
req.add_header("Origin", "http://www.thsrc.com.tw")
# Mimic a real browser's User-Agent.
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
# Supplying data turns the request into a POST; the body must be bytes, hence
# the encode(). The context manager closes the response after it is read.
with request.urlopen(req, data=postData.encode("utf-8")) as resp:
    print(resp.read().decode("UTF-8"))