猜想:看廖雪峰python教程的時候我就在想,這上百篇的教程,到底有多少人能夠堅持到最后呢?然后就想到,學完后一定要把這種半途而廢的現象赤裸裸地展現出來,哈哈哈。結果請看圖片,不言自明,下降趨勢明顯。
參考資料
- 這是一個人寫的爬蟲入門教程,我覺得很適合入門
- Python 爬蟲:把廖雪峰教程轉(zhuǎn)換成 PDF 電子書
- 《python編程:從入門到實踐》第15章開始有講怎么畫圖
步驟方法:
1、請詳細耐心看完以上的幾篇入門文章之后
2、所有教程的鏈接在第一篇教程的左邊,我們需要獲取所有的鏈接,這個在參考文章2里面有說到,請查看。
3、閱讀量:按F12打開開發者工具,然后Ctrl+F,輸入閱讀量的數字,發現這個數字在網頁中的位置,于是我們知道它在x-wiki-info這個class中,通過beautifulsoup的查找方法,我們可以得到這個標簽,之后就很容易得到這個數字了
4、最后是把數字以折線圖的形式畫出來,請看參考文檔3
5、之后學了異步,改為異步請求多個網頁,看看效果怎么樣,見代碼2
40個網(wǎng)站在相同的網(wǎng)速之下異步和同步所花時間之比為2:3
6、上個步驟容易導致下載的網速過快,然后就被封掉。這篇文章的最后教我們用協程的信號量來控制協程的個數,這樣就可以避免上述問題。見代碼3
- 代碼1:同步執(zhí)行源碼:
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import time
import os
START_URL = 'https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
# Wall-clock timing decorator.
def timing(f):
    """Decorator: print how many seconds a call to *f* took, then
    return f's result unchanged."""
    from functools import wraps

    @wraps(f)  # preserve f.__name__ so the report names the real function
    def wrap(*args, **kwargs):  # forward kwargs too, not just positionals
        time1_s = time.time()
        ret = f(*args, **kwargs)
        time2_s = time.time()
        print('%s function took %0.3f s' % (f.__name__, (time2_s-time1_s)))
        return ret
    return wrap
@timing
def get_all_urls():
    """Fetch the tutorial index page and return the sidebar <li> tags.

    Each returned tag wraps an <a> whose href is a tutorial page path
    (relative to the site root) — see main() for how they are used.
    """
    # Spoof a browser User-Agent: the site rejects the default requests UA.
    headers = {'User-Agent': USER_AGENT}
    # timeout keeps one stalled connection from hanging the whole crawl
    response = requests.get(START_URL, headers=headers, timeout=10)
    bsobj = BeautifulSoup(response.content, 'lxml')
    urls = bsobj.find('ul', {'class': 'uk-nav uk-nav-side', 'style': 'margin-right:-15px;'}).find_all('li')
    return urls
@timing
def ReadNum(url):
    """Return the reading count of the tutorial page at *url*.

    Returns None when the expected markup is missing (main() already
    skips None results). Expects the count inside the <span> of the
    element with class 'x-wiki-info', as text like '阅读:12345'
    (full-width colon) — TODO confirm against the live page.
    """
    headers = {'User-Agent': USER_AGENT}  # pretend to be a browser
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, 'lxml')
    ReadInfo = soup.find(class_='x-wiki-info')
    # Guard against layout changes instead of raising AttributeError.
    if ReadInfo is None or ReadInfo.span is None:
        return None
    num = int(ReadInfo.span.string.split(':')[1])
    return num
def main():
    """Crawl every tutorial page, collect reading counts, and plot them."""
    all_readInfo = []
    urls = get_all_urls()
    urls_num = len(urls)
    for i, url in enumerate(urls):
        # Use the loop variable directly instead of re-indexing urls[i].
        num = ReadNum('https://www.liaoxuefeng.com' + url.a['href'])
        if num is not None:
            all_readInfo.append(num)
        print('還剩下', urls_num - i)
    plt.plot(all_readInfo)
    plt.show()
- 代碼2:異步執行源碼,python版本要大于3.4
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import time
import aiohttp
import asyncio
START_URL = 'https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
READING_INFO = []
NUM_LEFT = 0
# Wall-clock timing decorator.
def timing(f):
    """Decorator: print how many seconds a call to *f* took, then
    return f's result unchanged."""
    from functools import wraps

    @wraps(f)  # preserve f.__name__ so the report names the real function
    def wrap(*args, **kwargs):  # forward kwargs too, not just positionals
        time1_s = time.time()
        ret = f(*args, **kwargs)
        time2_s = time.time()
        print('%s function took %0.3f s' % (f.__name__, (time2_s-time1_s)))
        return ret
    return wrap
@timing
def get_all_urls():
    """Fetch the tutorial index and return the absolute URL of every chapter.

    Reads the sidebar <ul> of the index page and joins each <a>'s href
    onto the site root.
    """
    headers = {'User-Agent': USER_AGENT}  # pretend to be a browser
    # timeout keeps a stalled connection from hanging the whole run
    response = requests.get(START_URL, headers=headers, timeout=10)
    bsobj = BeautifulSoup(response.content, 'html.parser')
    tags = bsobj.find('ul', {'class': 'uk-nav uk-nav-side', 'style': 'margin-right:-15px;'}).find_all('a')
    # Comprehension replaces the manual append loop.
    urls = ['https://www.liaoxuefeng.com' + tag['href'] for tag in tags]
    return urls
def read_num(url):
    """Synchronously fetch *url* and return its reading count as an int.

    Returns None when the 'x-wiki-info' element or its <span> is missing,
    so callers can skip pages whose layout changed.
    """
    headers = {'User-Agent': USER_AGENT}  # pretend to be a browser
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    ReadInfo = soup.find(class_='x-wiki-info')
    if ReadInfo is None or ReadInfo.span is None:
        return None
    num = int(ReadInfo.span.string.split(':')[1])
    return num
async def read_num_asyncio(url, index):
    """Asynchronously fetch *url*, parse its reading count, and store it
    at READING_INFO[index]; also bumps the NUM_LEFT progress counter."""
    headers = {'User-Agent': USER_AGENT}  # pretend to be a browser
    async with aiohttp.ClientSession() as client:
        async with client.get(url, headers=headers) as resp:
            # Fail loudly on a bad status; a bare `assert` would be
            # stripped when Python runs with -O.
            resp.raise_for_status()
            txt = await resp.text()
    soup = BeautifulSoup(txt, 'html.parser')
    read_info = soup.find(class_='x-wiki-info')
    num = int(read_info.span.string.split(':')[1])
    global NUM_LEFT, READING_INFO
    READING_INFO[index] = num
    NUM_LEFT += 1
    print(NUM_LEFT)  # crude progress report
# Fetch the reading count of many tutorial pages concurrently.
def asyncio_get_reading_num(urls, limit=20):
    """Fetch reading counts for the first *limit* entries of *urls*
    concurrently; results land in READING_INFO via read_num_asyncio.

    *limit* generalizes the previously hard-coded cap of 20 pages.
    """
    # Use a fresh loop (instead of closing the process-wide default one)
    # so the function can be called more than once per process.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    tasks = [read_num_asyncio(url, index)
             for index, url in enumerate(urls) if index < limit]
    try:
        # gather() — asyncio.wait() on bare coroutines is deprecated.
        loop.run_until_complete(asyncio.gather(*tasks))
    finally:
        loop.close()
@timing
def main():
    """Entry point: gather all chapter URLs, then fetch their reading
    counts asynchronously into READING_INFO."""
    urls = get_all_urls()
    print(len(urls))
    global READING_INFO
    # Pre-size so each coroutine can write its result by index.
    READING_INFO = [0] * len(urls)
    asyncio_get_reading_num(urls)


if __name__ == '__main__':
    main()
- 代碼3:用協(xié)程信號量來限制協(xié)程運行個數(shù)
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import time
import aiohttp
import asyncio
START_URL = 'https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
READING_INFO = []
NUM_LEFT = 0
# Wall-clock timing decorator.
def timing(f):
    """Decorator: print how many seconds a call to *f* took, then
    return f's result unchanged."""
    from functools import wraps

    @wraps(f)  # preserve f.__name__ so the report names the real function
    def wrap(*args, **kwargs):  # forward kwargs too, not just positionals
        time1_s = time.time()
        ret = f(*args, **kwargs)
        time2_s = time.time()
        print('%s function took %0.3f s' % (f.__name__, (time2_s-time1_s)))
        return ret
    return wrap
@timing
def get_all_urls():
    """Fetch the tutorial index and return the absolute URL of every chapter.

    Reads the sidebar <ul> of the index page and joins each <a>'s href
    onto the site root.
    """
    headers = {'User-Agent': USER_AGENT}  # pretend to be a browser
    # timeout keeps a stalled connection from hanging the whole run
    response = requests.get(START_URL, headers=headers, timeout=10)
    bsobj = BeautifulSoup(response.content, 'html.parser')
    tags = bsobj.find('ul', {'class': 'uk-nav uk-nav-side', 'style': 'margin-right:-15px;'}).find_all('a')
    # Comprehension replaces the manual append loop.
    urls = ['https://www.liaoxuefeng.com' + tag['href'] for tag in tags]
    return urls
def read_num(url):
    """Synchronously fetch *url* and return its reading count as an int.

    Returns None when the 'x-wiki-info' element or its <span> is missing,
    so callers can skip pages whose layout changed.
    """
    headers = {'User-Agent': USER_AGENT}  # pretend to be a browser
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    ReadInfo = soup.find(class_='x-wiki-info')
    if ReadInfo is None or ReadInfo.span is None:
        return None
    num = int(ReadInfo.span.string.split(':')[1])
    return num
async def read_num_asyncio(url, index, sem):
    """Fetch *url* while holding semaphore *sem* (caps concurrent
    downloads), parse the reading count, and store it at
    READING_INFO[index]; also bumps the NUM_LEFT progress counter."""
    headers = {'User-Agent': USER_AGENT}  # pretend to be a browser
    # `with (await sem)` was removed in Python 3.10; `async with` is the
    # supported way to hold an asyncio.Semaphore.
    async with sem:
        async with aiohttp.ClientSession() as client:
            async with client.get(url, headers=headers) as resp:
                # raise_for_status() instead of `assert`: asserts are
                # stripped when Python runs with -O.
                resp.raise_for_status()
                txt = await resp.text()
    soup = BeautifulSoup(txt, 'html.parser')
    read_info = soup.find(class_='x-wiki-info')
    num = int(read_info.span.string.split(':')[1])
    global NUM_LEFT, READING_INFO
    READING_INFO[index] = num
    NUM_LEFT += 1
    print(NUM_LEFT)  # crude progress report
# Fetch the reading count of every tutorial page concurrently, throttled.
def asyncio_get_reading_num(urls, max_concurrency=5):
    """Fetch the reading count of every URL in *urls* concurrently, with
    at most *max_concurrency* downloads in flight.

    Throttling avoids hammering the site and getting the IP banned; tune
    *max_concurrency* (previously hard-coded to 5) to the target site.
    """
    # Fresh loop, set as current so the Semaphore and gather() bind to it;
    # closing the process-wide default loop would make this single-use.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    sem = asyncio.Semaphore(max_concurrency)
    tasks = [read_num_asyncio(url, index, sem)
             for index, url in enumerate(urls)]
    try:
        # gather() — asyncio.wait() on bare coroutines is deprecated.
        loop.run_until_complete(asyncio.gather(*tasks))
    finally:
        loop.close()
@timing
def main():
    """Entry point: collect every chapter URL, fetch all reading counts
    with throttled concurrency, then plot them as a line chart."""
    urls = get_all_urls()
    print(len(urls))
    global READING_INFO
    # Pre-size so each coroutine can write its result by index.
    READING_INFO = [0] * len(urls)
    asyncio_get_reading_num(urls)
    plt.plot(READING_INFO)
    plt.show()


if __name__ == '__main__':
    main()