What are requests and urllib? Both simulate HTTP requests: you can send GET and POST requests, customize request headers, carry parameters, use proxies, and use cookies.
requests offers the same functionality as urllib, but it is a third-party library with a much cleaner, more human-friendly API.
Installation:
pip install requests
Documentation: the official requests docs at https://requests.readthedocs.io/ cover everything below in more detail.
get绷雏,帶參數(shù)
r = requests.get(url, params=data)
r.text 獲取字符串格式的文本內(nèi)容
r.content 獲取字節(jié)格式的文本內(nèi)容
r.status_code 獲取狀態(tài)碼
r.headers 獲取響應(yīng)頭部
r.url 請(qǐng)求的url
r.encoding 可以讀取和設(shè)置編碼格式
import requests

url = 'https://www.baidu.com/s?'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
data = {
    'ie': 'utf-8',
    'wd': '天海翼'
}
r = requests.get(url, headers=headers, params=data)
'''
Attributes and methods of the response object:
r.text         body as a decoded string
r.content      body as raw bytes
r.status_code  HTTP status code
r.headers      response headers
r.url          final URL of the request
r.encoding     read or set the encoding used to decode r.text
'''
# r.encoding = 'gbk'
# print(r.text)
# print(r.content)
# print(r.encoding)
with open('tian.html', 'wb') as fp:
    fp.write(r.content)
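The commented-out r.encoding line above matters more than it looks: requests guesses the encoding from the response headers, and when that guess is wrong r.text comes out garbled. A small sketch of the usual fix, continuing from the response r obtained above and using r.apparent_encoding (detected from the body itself, a real requests attribute):
# the header-based guess can be ISO-8859-1 even for a UTF-8 page
print(r.encoding, r.apparent_encoding)
r.encoding = r.apparent_encoding   # decode r.text with the detected encoding instead
print(r.text[:200])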
POST requests with form data
r = requests.post(url=url, data=formdata)
import requests

url = 'https://cn.bing.com/ttranslationlookup?&IG=DF7E5AE6CE974F879A127BDEB8927608&IID=translator.5036.9'
formdata = {
    'from': 'zh-CHS',
    'to': 'en',
    'text': '香蕉',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
r = requests.post(url=url, headers=headers, data=formdata)
# print(r.text)
print(type(r.json()))
AJAX interfaces
r.json(): equivalent to first getting the JSON-formatted string and then converting it to a Python object with json.loads
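A minimal sketch of that equivalence. It uses https://httpbin.org/get as a stand-in JSON endpoint, since the Bing translator URL above carries what look like session-specific IG/IID parameters and may not respond outside a browser session:
import json
import requests

r = requests.get('https://httpbin.org/get', params={'q': 'banana'})
# r.json() parses the JSON body directly ...
obj1 = r.json()
# ... which is the same as decoding the text and calling json.loads yourself
obj2 = json.loads(r.text)
print(type(obj1), obj1 == obj2)   # <class 'dict'> True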
Using a proxy
r = requests.get(url=url, headers=headers, proxies=proxy)
import requests

proxy = {
    'http': '218.60.8.99:3129'
}
url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
r = requests.get(url=url, headers=headers, proxies=proxy)
with open('ip.html', 'wb') as fp:
    fp.write(r.content)
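Note that the proxies dict is keyed by URL scheme, so https:// URLs only go through the proxy if an 'https' entry is present. A sketch of the more complete form; the address is just the example IP from above and is unlikely to still be alive, so treat it as a placeholder:
import requests

proxies = {
    'http': 'http://218.60.8.99:3129',    # used for http:// URLs
    'https': 'http://218.60.8.99:3129',   # used for https:// URLs (placeholder address)
}
# a short timeout avoids hanging forever on a dead proxy
r = requests.get('http://www.baidu.com/s?ie=UTF-8&wd=ip', proxies=proxies, timeout=5)
print(r.status_code)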
Using cookies
s = requests.Session()
Requests sent with s.get() and s.post() will automatically save and carry cookies.
import requests

# create a session
s = requests.Session()
# from here on, send every request with s.get / s.post and the session will
# automatically save and carry the cookies
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
# first send a POST request to the login endpoint
post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018931119824'
formdata = {
    'email': '17701256561',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '0ecca3193d71e76959033bac6ecd009210c0b90ed14f35e2c8cdfbe512c83986',
    'rkey': 'cb15f985754fd884a44506ff5db1256e',
    'f': 'http%3A%2F%2Fwww.renren.com%2F960481378',
}
r = s.post(post_url, headers=headers, data=formdata)
print(r.text)
# the session has saved the login cookie for us,
# so we can now access a page that requires login
info_url = 'http://www.renren.com/960481378/profile'
r = s.get(info_url, headers=headers)
with open('info.html', 'wb') as fp:
    fp.write(r.content)
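To confirm that the session really captured the login cookies, you can inspect its cookie jar after the s.post() call above; a short sketch using s.cookies (a RequestsCookieJar):
# the jar holds whatever Set-Cookie headers the server sent back
for cookie in s.cookies:
    print(cookie.name, cookie.value)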
Exception handling
All of the following exceptions live in the requests.exceptions module:
ConnectionError: the counterpart of urllib's URLError
HTTPError: the counterpart of urllib's HTTPError
Timeout: raised when a request times out
import requests
import requests.exceptions

# url = 'http://www.baidu.com/'
# try:
#     r = requests.get(url, timeout=0.001)
# except requests.exceptions.Timeout as e:
#     print(e)
r = requests.get('https://www.12306.cn', verify=False)
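A minimal sketch of handling the three exception types listed above; raise_for_status() turns 4xx/5xx responses into HTTPError, and the URL is just a placeholder:
import requests
from requests.exceptions import ConnectionError, HTTPError, Timeout

try:
    r = requests.get('http://www.baidu.com/', timeout=5)
    r.raise_for_status()          # raises HTTPError on 4xx/5xx status codes
except Timeout as e:
    print('request timed out:', e)
except ConnectionError as e:
    print('could not connect:', e)
except HTTPError as e:
    print('bad status code:', e)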
Ignoring SSL certificate verification
r = requests.get('https://www.12306.cn', verify=False)
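With verify=False, requests still prints an InsecureRequestWarning on every call; if that noise is unwanted, urllib3's warning can be silenced. A sketch, not something the original notes do:
import requests
import urllib3

# suppress the InsecureRequestWarning that verify=False triggers
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

r = requests.get('https://www.12306.cn', verify=False)
print(r.status_code)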
Example: scraping Beijing bus routes from 8684.cn
Scraping nationwide: first pull the list of city abbreviations
import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
url = 'http://js.8684.cn/citys/city_boxInf.min.js'
r = requests.get(url=url, headers=headers)
content = r.text
string = content.split('=')[-1].rstrip(';')
# print(string)
pattern = re.compile(r'([a-z]+):')
ret = pattern.findall(string)
print(len(ret))
for i in ret:
    print(i)
Before the regex extraction: (screenshot of the raw city_boxInf.min.js content omitted)
After the extraction: (screenshot of the extracted city abbreviations omitted)
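The abbreviations captured by the regex (beijing, shanghai, ...) can be turned into the per-city entry pages that the scraper below starts from. A hedged sketch, continuing from the ret list above and assuming every city site follows the same http://<abbr>.8684.cn/ pattern as the Beijing page used below:
# build one entry URL per city from the extracted abbreviations
city_urls = ['http://%s.8684.cn/' % abbr for abbr in ret]
print(city_urls[:5])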
import requests
from bs4 import BeautifulSoup
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}


def parse_first_page(url):
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # links for routes whose names start with a digit
    number_a_list = soup.select('.bus_kt_r1 > a')
    # links for routes whose names start with a letter
    char_a_list = soup.select('.bus_kt_r2 > a')
    all_a_list = number_a_list + char_a_list
    all_href_list = []
    # collect the href attribute of every <a> tag
    for oa in all_a_list:
        href = url.rstrip('/') + oa['href']
        all_href_list.append(href)
    # print(len(char_a_list))
    return all_href_list


def parse_second_page(url, nchref):
    r = requests.get(url=nchref, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # all route links starting with the given digit or letter
    a_list = soup.select('#con_site_1 > a')
    # build the full URL of every bus route
    href_list = []
    for oa in a_list:
        href = url.rstrip('/') + oa['href']
        href_list.append(href)
    return href_list


def parse_third_page(href, fp):
    r = requests.get(url=href, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # route name
    route_name = soup.select('.bus_i_t1 > h1')[0].string
    print('crawling ---%s--- ......' % route_name)
    # operating hours
    run_time = soup.select('.bus_i_content > p')[0].string
    # fare information
    piao_info = soup.select('.bus_i_content > p')[1].string
    # operating company
    company = soup.select('.bus_i_content > p > a')[0].string
    # last update time
    update_time = soup.select('.bus_i_content > p')[3].string.lstrip('最后更新:')
    # route length
    try:
        route_length = soup.select('.bus_label > p')[0].string.strip('全程公里。')
    except Exception as e:
        route_length = 'no length info'
    # number of stops in the up direction
    up_total = soup.select('.bus_line_top > span')[0].string.strip('共站').strip()
    # stop names in the up direction
    up_site_name_list = []
    # all stop links on the page
    all_a_list = soup.select('.bus_line_site > .bus_site_layer > div > a')
    # the first up_total links belong to the up direction
    up_a_list = all_a_list[:int(up_total)]
    for oa in up_a_list:
        up_site_name_list.append(oa.string)
    try:
        # number of stops in the down direction
        down_total = soup.select('.bus_line_top > span')[1].string.strip('共站').strip()
        # stop names in the down direction
        down_a_list = all_a_list[int(up_total):]
        down_site_name_list = []
        for oa in down_a_list:
            down_site_name_list.append(oa.string)
    except Exception as e:
        down_total = 'no down direction'
        down_site_name_list = []
    # collect everything into one dict
    item = {
        'route name': route_name,
        'operating hours': run_time,
        'fare': piao_info,
        'company': company,
        'last updated': update_time,
        'route length': route_length,
        'up-direction stop count': up_total,
        'up-direction stops': up_site_name_list,
        'down-direction stop count': down_total,
        'down-direction stops': down_site_name_list,
    }
    string = json.dumps(item, ensure_ascii=False)
    fp.write(string + '\n')
    print('finished ---%s---' % route_name)


def main():
    fp = open('北京公交.txt', 'w', encoding='utf8')
    url = 'http://beijing.8684.cn/'
    # request the first-level page and get the digit/letter index links
    number_char_href_list = parse_first_page(url)
    # visit each index link in turn
    for nchref in number_char_href_list:
        # request the second-level page and get every bus-route URL under it
        href_list = parse_second_page(url, nchref)
        # visit every route URL and parse it
        for href in href_list:
            parse_third_page(href, fp)
    fp.close()


if __name__ == '__main__':
    main()