I haven't been learning Python for long, but I'd still like to share some of my own ideas and approaches so we can grow together. Here is today's post; I hope you'll keep following along!
Two basic libraries: urllib and requests.

1. Using urllib
import urllib.request
response = urllib.request.urlopen('https://www.python.org')  # returns an HTTPResponse object
# print(response.read().decode('utf-8'))  # page source
print(response.getheader('Server'))  # read the Server value from the response headers
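urlopen also takes a timeout argument; when the server fails to respond in time, urllib raises URLError with a socket.timeout as its reason. A minimal sketch (the 0.1-second timeout is deliberately tiny just to trigger the error):

import socket
import urllib.error
try:
    response = urllib.request.urlopen('https://www.python.org', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('the request timed out')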
from urllib import parse
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'}  # request headers; changing the User-Agent disguises the crawler as a browser
data_dict = {
    'name': 'Germey'}
data = bytes(parse.urlencode(data_dict), encoding='utf8')  # convert to a byte stream with the bytes method
req = urllib.request.Request(url=url, data=data, headers=headers, method='POST')  # returns a Request object
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))
# Pages that require authentication
import urllib.error
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
username = 'username'
password = 'password'
url = 'http://localhost:5000/'
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)  # register the username and password with add_password
auth_handler = HTTPBasicAuthHandler(p)  # instantiate an HTTPBasicAuthHandler that handles the verification
opener = build_opener(auth_handler)  # build an opener from that handler
try:
    result = opener.open(url)  # send the request through the opener
    html = result.read().decode('utf-8')
    print(html)
except urllib.error.URLError as a:
    print('handler error reason: ' + str(a.reason))  # a made-up address, so it cannot open; we print the reason here
# Using a proxy
from urllib.request import ProxyHandler
proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})  # keys are protocol types, values are proxy links; several entries may be added; this proxy runs on port 9743
opener = build_opener(proxy_handler)  # build the opener
try:
    response = opener.open('https://www.baidu.com')  # open the link
    print(response.read().decode('utf-8'))
except urllib.error.URLError as e:
    print('proxy error reason: ' + str(e.reason))
# Handling cookies
import http.cookiejar
cookie = http.cookiejar.CookieJar()                   # declare a CookieJar object
handler = urllib.request.HTTPCookieProcessor(cookie)  # build the handler
opener = urllib.request.build_opener(handler)         # build the opener
response = opener.open('http://www.baidu.com')        # send the request
for item in cookie:
    print(item.name + '=' + item.value)
# Saving cookies to a file, demonstrated below:
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)    # a CookieJar subclass that also handles the file side of cookies
handler = urllib.request.HTTPCookieProcessor(cookie)  # build the handler
opener = urllib.request.build_opener(handler)         # build the opener
response = opener.open('http://www.baidu.com')        # send the request
cookie.save(ignore_discard=True, ignore_expires=True)

# A second file format
filename = 'cookie_LWP.txt'
cookie = http.cookiejar.LWPCookieJar(filename)        # a CookieJar subclass that saves in libwww-perl (LWP) format
handler = urllib.request.HTTPCookieProcessor(cookie)  # build the handler
opener = urllib.request.build_opener(handler)         # build the opener
response = opener.open('http://www.baidu.com')        # send the request
cookie.save(ignore_discard=True, ignore_expires=True)
# Reading the saved file back
filename = 'cookie_LWP.txt'
cookie = http.cookiejar.LWPCookieJar()
cookie.load(filename, ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)  # build the handler
opener = urllib.request.build_opener(handler)         # build the opener
response = opener.open('http://www.baidu.com')        # send the request
# print(response.read().decode('utf-8'))  # the Baidu page source
# Parsing links: extracting, combining, and converting the parts of a URL
from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print('type of result: ', end='')
print(type(result))
print(result)  # 6 parts: scheme (protocol), netloc (domain), path, params, query, fragment (an anchor locating a position within the page)
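The result is a ParseResult, a kind of named tuple, so each part can be read either by attribute or by index; a quick illustration:

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(result.scheme, result[0])  # both print 'http'; attribute and index access are equivalent
print(result.netloc)             # www.baidu.com
print(result.query)              # id=5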
# Another way to use urlparse
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')  # note: the scheme argument only takes effect when the URL itself contains no scheme
print('type of result: ', end='')
print(type(result))
print(result)
# The opposite method, urlunparse, accepts any iterable, but its length must be exactly 6
from urllib.parse import urlunparse
data = ['https', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
# urlsplit is similar to urlparse but does not parse the params part, returning it as part of path
from urllib.parse import urlsplit
result = urlsplit('https://www.baidu.com/index.html;user?a=6#comment')
print(result)
# Likewise, urlsplit has an opposite method, urlunsplit
# The urljoin method updates and completes links; it takes two arguments
from urllib.parse import urljoin
print(urljoin('http://www.baidu.com/about.html', 'https://pangzixi.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com/about.html', 'http://www.baidu.com/about.html?question=2'))
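The rule behind urljoin: scheme, netloc, and path are taken from the second argument when present, and filled in from the base URL otherwise. Two more illustrative calls:

print(urljoin('http://www.baidu.com', 'FAQ.html'))                # http://www.baidu.com/FAQ.html
print(urljoin('http://www.baidu.com/about.html', '?category=2'))  # http://www.baidu.com/about.html?category=2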
# urlencode builds GET request parameters (serialization); parse_qs and parse_qsl reverse it
from urllib.parse import urlencode, parse_qs, parse_qsl
params = {
    'name': 'germey',
    'age': 22}  # first declare a dict that expresses the parameters
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)  # urlencode turns the dict into request parameters
print('serialized: ' + str(url))
query = 'name=germey&age=22'
print('deserialized: ' + str(parse_qs(query)))
print('deserialized (as a list of tuples): ' + str(parse_qsl(query)))
# The quote method prevents garbled characters when Chinese text appears in a URL
from urllib.parse import quote, unquote
keyword = '張三'
url = 'http://www.baidu.com/s?wd=' + quote(keyword)  # quote applies UTF-8 percent-encoding to the Chinese text
print(url)
url = 'http://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8'
print(unquote(url))  # unquote decodes it again
# Analyzing the Robots protocol, which declares which pages crawlers and search engines may fetch and which they may not
'''
Typical format:
User-agent: *    applies to every crawler
Disallow: /      no page may be crawled
Allow: /public/  the public directory may be crawled
'''
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()  # create the parser object
rp.set_url('http://www.reibang.com/robots.txt')  # set the robots.txt link
rp.read()  # fetch and parse robots.txt; an alternative form is:
# rp.parse(urlopen('http://www.reibang.com/robots.txt').read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))  # True/False: is this page allowed to be crawled?
二蜕着、requests的使用,相較于urllib庫红柱,明顯可以感覺requests的功能更強大而且方便承匣。
# requests makes cookies, login authentication, proxy setup, and so on much easier
import requests
r = requests.get('https://www.baidu.com/')  # send a GET request
print(r.cookies)
print(r.status_code)  # the response status code
# GET requests
r = requests.get('http://httpbin.org/get')
# print(r.text)
# Attaching extra parameters
data = {
    'name': 'germey',
    'age': 32}
r = requests.get('http://httpbin.org/get', params=data)
# print(r.text)
# The response body is a str in JSON format, so the json() method can be called on it directly
# print(r.json())
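For instance, httpbin.org echoes the request details back as JSON, so json() yields a plain dict to index into; a small sketch (note that json() raises an error if the body is not valid JSON):

r = requests.get('http://httpbin.org/get')
data = r.json()     # parse the JSON body into a dict
print(type(data))   # <class 'dict'>
print(data['url'])  # http://httpbin.org/get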
# Fetching binary data
r = requests.get('https://www.github.com/favicon.ico')
with open('favicon.ico', 'wb') as fn:
    fn.write(r.content)  # save the binary file
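For large files, loading the whole body through r.content can eat memory; requests can stream the response in chunks instead. The same download sketched with stream=True:

r = requests.get('https://www.github.com/favicon.ico', stream=True)
with open('favicon.ico', 'wb') as fn:
    for chunk in r.iter_content(chunk_size=1024):  # write 1 KB at a time
        fn.write(chunk)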
# Adding headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
r = requests.get('https://www.zhihu.com/explore', headers=headers)
# print(r.text)
# POST requests
r = requests.post('http://httpbin.org/post', data=data)
# print(r.text)
# requests has a built-in status-code lookup object
if r.status_code != requests.codes.ok:
    exit()
else:
    print('Request Successfully')
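requests.codes maps readable names to numeric codes, so the same comparison works for any status; for example:

print(requests.codes.ok)         # 200
print(requests.codes.not_found)  # 404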
# Advanced requests usage: file uploads, cookie handling, proxy setup
# File upload, using an image as the example
files = {'file': open('favicon.ico', 'rb')}  # keep the .py and .ico files in the same directory, or fill in an absolute path
r = requests.post('http://httpbin.org/post', files=files)
# print(r.text)
# Working with cookies
r = requests.get('https://www.baidu.com')
for key, value in r.cookies.items():
    print(key + '=' + value)
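Cookies can also be sent with a request through the cookies parameter, which accepts a plain dict; a minimal sketch (the cookie name and value here are made up):

cookies = {'token': 'example-value'}  # hypothetical cookie
r = requests.get('http://httpbin.org/cookies', cookies=cookies)
print(r.text)  # httpbin echoes back the cookies it received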
# Session keeps state between requests
s = requests.Session()
s.get('http://httpbin.org/cookies/set/number/123456')  # set a cookie named number with value 123456
r = s.get('http://httpbin.org/cookies')  # read back the cookies this site now sees
print(r.text)
# SSL certificate verification
from requests.packages import urllib3
urllib3.disable_warnings()  # silence the warning about the unverified certificate
response = requests.get('https://www.12306.cn', verify=False)  # verify=False avoids the SSLError
print(response.status_code)
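Rather than switching verification off, you can point verify at a local CA bundle, or supply a client certificate through cert; the paths below are placeholders for your own files:

response = requests.get('https://www.12306.cn', verify='/path/to/ca-bundle.crt')  # hypothetical CA bundle path
response = requests.get('https://www.12306.cn', cert=('/path/client.crt', '/path/client.key'))  # hypothetical client certificate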
# Proxy setup, for the IP bans that large-scale crawling can trigger
# This uses the proxies parameter
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'https://10.10.1.10:1080'}  # just an example; this proxy does not really exist
requests.get('https://www.taobao.com', proxies=proxies)
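Besides HTTP proxies, requests also supports SOCKS proxies once the optional dependency is installed (pip install "requests[socks]"); the address and credentials below are just as fictional as the example above:

proxies = {
    'http': 'socks5://user:password@10.10.1.10:1080',  # hypothetical SOCKS5 proxy
    'https': 'socks5://user:password@10.10.1.10:1080'}
requests.get('https://www.taobao.com', proxies=proxies)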
# The timeout setting
r = requests.get('https://www.taobao.com', timeout=1)  # raise an exception if no response arrives within 1 second
# To wait forever, set timeout to None or simply omit the timeout parameter
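The failure arrives as requests.exceptions.Timeout, so it can be caught cleanly instead of crashing the crawler; a minimal sketch:

try:
    r = requests.get('https://www.taobao.com', timeout=1)
except requests.exceptions.Timeout:
    print('no response within 1 second')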
# Authentication, for pages that ask you to log in
r = requests.get('http://localhost:5000', auth=('username', 'password'))  # username and password are placeholders; fill in your real credentials to log in
# Alternatively there is OAuth1 authentication, which is rather painful
from requests_oauthlib import OAuth1
url = 'https://api.twitter.com/...'
auth = OAuth1('yourappkey', 'yourappsecret', 'useroauthtoken', 'useroauthtokensecret')
requests.get(url, auth=auth)
# The Prepared Request data structure
from requests import Request, Session
url = 'http://httpbin.org/post'
data = {'name': 'germey'}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'}  # copying the User-Agent from your own browser works fine here
s = Session()
request = Request('POST', url, data=data, headers=headers)
prepped = s.prepare_request(request)  # wrap it into a PreparedRequest object
r = s.send(prepped)
print(r.text)
The Python version used is 3.9.0. I'm new to web scraping, so comments and discussion are very welcome. I learned all of this through 代碼課堂 (Code Classroom); feel free to join in!