前言
- 蛋肥學習了如何提升爬蟲速度,打算分別嘗試單線程爬蟲、多線程爬蟲、多進程爬蟲、多協程爬蟲來進行數據抓取,并對比其實際抓取速度。
準備
爬取時間:2021/03/10
系統環境:Windows 10
所用工具:Jupyter Notebook\Python 3.0
涉及的庫:requests\lxml\selenium\time\threading\queue\multiprocessing\gevent\sys
獲取網址信息
優設導航
https://hao.uisdc.com/
import requests
from lxml import etree
def getinfo(xpath, url="https://hao.uisdc.com/"):
    """Download a page and evaluate an XPath expression against it.

    Args:
        xpath: XPath expression to run on the parsed HTML document.
        url: Page to download. Defaults to the navigation site's home page,
            which is what the original hard-coded.

    Returns:
        Whatever ``html.xpath(xpath)`` produces (a list of matches for the
        attribute/text queries used in this file).

    Raises:
        requests.RequestException: if the request fails, times out after
            10 seconds, or the server answers with an HTTP error status.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0"}
    r = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently querying an error page
    r.raise_for_status()
    html = etree.HTML(r.text)
    return html.xpath(xpath)
# Collect the target URLs and their display titles; each query triggers its
# own download of the page, in this order: links first, then titles.
link, title = (
    getinfo(query)
    for query in (
        '//div[@class="item"]/a/@href',
        '//div[@class="item"]/a/h3/text()',
    )
)
獲取網頁截圖
單線程爬蟲
from selenium import webdriver
import time
def getshot(url, name):
    """Screenshot every page sequentially with one browser and print the time.

    Args:
        url: Sequence of page URLs to visit.
        name: Sequence of names, parallel to ``url``; each name becomes part
            of the screenshot file name.

    A page that fails to load or save is reported and skipped. The browser
    is always closed when the run ends, even if something goes wrong.
    """
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    try:
        driver.maximize_window()
        start = time.time()
        # zip() pairs each URL with its name and stops at the shorter list,
        # so a missing name can no longer raise inside the loop
        for page_url, page_name in zip(url, name):
            try:
                driver.get(page_url)
                # Wait for the page to finish loading
                time.sleep(1)
                driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁(yè)截圖\img" + page_name + ".png")
            except Exception as exc:
                # Best effort: report the failure and move on to the next page
                print("screenshot failed:", page_url, repr(exc))
                continue
        end = time.time()
        print("單線程爬蟲(chóng)所用時(shí)間:", end - start)
    finally:
        # Previously the Chrome instance was left running after the loop
        driver.quit()


getshot(link, title)
多線程爬蟲
參考資料
Python多線程
import threading
import time
import queue as Queue
from selenium import webdriver
# Wall-clock start of the multi-thread benchmark
start = time.time()


# Screenshot function; the queue get() has a timeout so a worker never
# hangs forever when nothing can be fetched.
def getshot(name, url):
    """Take one URL from the work queue and save a screenshot of it.

    Args:
        name: Worker name; used as the prefix of the screenshot file name and
            in the error message.
        url: Work queue holding the page URLs (despite the parameter name it
            is the queue, not a single URL).

    Raises:
        The queue's "empty" exception when no URL arrives within 2 seconds;
        the caller's loop uses it as its stop signal.
    """
    target = url.get(timeout=2)
    picname = name + " " + str(time.time())
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    try:
        driver.maximize_window()
        driver.get(target)
        # Wait for the page to finish loading
        time.sleep(1)
        driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁(yè)截圖\img" + picname + ".png")
    except Exception:
        print(name + "出錯(cuò)")
    finally:
        # Close the browser on failure too; it used to leak whenever
        # loading or saving raised
        driver.quit()
class myThread(threading.Thread):
    """Worker thread that keeps taking screenshots until the queue is empty."""

    def __init__(self, name, url):
        """Store the worker name and the shared work queue.

        Args:
            name: Thread name, also used to label screenshots.
            url: Work queue the thread pulls URLs from.
        """
        super().__init__()
        self.name = name
        self.url = url

    def run(self):
        """Process queue items until the queue stays empty or a worker error occurs."""
        # Local import: the surrounding file aliases/rebinds the name "Queue"
        from queue import Empty

        while True:
            try:
                getshot(self.name, self.url)
            except Empty:
                # Nothing arrived within the get() timeout: work is done
                break
            except Exception as exc:
                # Still stop, as before, but no longer silently
                print(self.name, "stopped after error:", repr(exc))
                break
threadlist = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"]
workQueue = Queue.Queue(200)
threads = []

# Launch the workers first; they wait on the (still empty) queue
for tName in threadlist:
    thread = myThread(tName, workQueue)
    thread.start()
    threads.append(thread)

# Feed every collected URL into the queue
for target in link:
    workQueue.put(target)

# Block until every worker has finished
for t in threads:
    t.join()

end = time.time()
print("Queue多線程爬蟲(chóng)所用時(shí)間:", end - start)
多進程爬蟲
參考資料
多進程在運行的時候只有一個子進程會運行,怎么解決
用python進行多進程編程時,只有主進程可以運行,子進程貌似沒有運行是什么原因
面試總結,多進程和多線程的區別
# A single-core CPU cannot run processes in parallel, so find out how many
# cores this machine has before going multi-process.
from multiprocessing import cpu_count

core_total = cpu_count()
print(core_total)  # the author's machine reports 8 cores
#Windows 以下代碼需寫成.py文件,然后用cmd啟動(蛋肥用的Anaconda Powershell Prompt)
from multiprocessing import Process,Queue
import time
from selenium import webdriver
# Wall-clock start of the multi-process benchmark
start = time.time()


# Screenshot function; the queue get() has a timeout so a worker never
# hangs forever when nothing can be fetched.
def getshot(name, url):
    """Take one URL from the work queue and save a screenshot of it.

    Args:
        name: Worker name; used as the prefix of the screenshot file name and
            in the error message.
        url: Work queue holding the page URLs (despite the parameter name it
            is the queue, not a single URL).

    Raises:
        The queue's "empty" exception when no URL arrives within 2 seconds;
        the caller's loop uses it as its stop signal.
    """
    target = url.get(timeout=2)
    picname = name + " " + str(time.time())
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    try:
        driver.maximize_window()
        driver.get(target)
        # Wait for the page to finish loading
        time.sleep(1)
        driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁(yè)截圖\img" + picname + ".png")
    except Exception:
        print(name + "出錯(cuò)")
    finally:
        # Close the browser on failure too; it used to leak whenever
        # loading or saving raised
        driver.quit()
class myProcess(Process):
    """Worker process that keeps taking screenshots until the queue is empty."""

    def __init__(self, name, url):
        """Store the worker name and the shared work queue.

        Args:
            name: Process name, also used to label screenshots.
            url: Work queue the process pulls URLs from.
        """
        super().__init__()
        self.name = name
        self.url = url

    def run(self):
        """Process queue items until the queue stays empty or a worker error occurs."""
        # multiprocessing queues signal a get() timeout with queue.Empty;
        # imported here because this file does not import the queue module
        from queue import Empty

        while True:
            try:
                print(self.name)
                getshot(self.name, self.url)
            except Empty:
                # Nothing arrived within the get() timeout: work is done
                break
            except Exception as exc:
                # Still stop, as before, but no longer silently
                print(self.name, "stopped after error:", repr(exc))
                break
# The guard is required: on Windows, multiprocessing starts each child in a
# fresh interpreter that imports this file again. Without the guard every
# child would re-run this section and try to start workers of its own.
if __name__ == "__main__":
    processlist = ["Process-1", "Process-2", "Process-3", "Process-4", "Process-5"]
    workQueue = Queue(200)
    # Create the worker processes
    processes = [myProcess(pName, workQueue) for pName in processlist]
    # Start the consumers BEFORE filling the queue: it holds at most 200
    # items, and put() on a full queue blocks until a consumer takes one.
    # Filling first would hang forever as soon as there are more than 200 links.
    for t in processes:
        t.start()
    # Fill the queue
    for target in link:
        workQueue.put(target)
    for t in processes:
        t.join()
    end = time.time()
    print("Queue多進(jìn)程爬蟲(chóng)所用時(shí)間:", end - start)
多協程爬蟲
參考資料
Python中gevent模塊使用及出現MonkeyPatchWarning
Python的最大遞歸深度錯誤maximum recursion depth exceeded while calling a Python object
# The monkey patch must come first in the file, ahead of the URL-fetching code (requests)
import gevent
from gevent import monkey
monkey.patch_all()
# Raise the maximum recursion depth limit
import sys
sys.setrecursionlimit(1000000)
from gevent.queue import Queue,Empty
import time
from selenium import webdriver
# Wall-clock start of the coroutine benchmark
start = time.time()


# Screenshot function; the queue get() has a timeout so a worker never
# hangs forever when nothing can be fetched.
def getshot(index):
    """Drain the global work queue, saving one screenshot per URL.

    Args:
        index: Worker number; becomes part of the screenshot file name.

    Each URL gets its own browser instance, which is closed again whether or
    not the screenshot succeeded.
    """
    while not workQueue.empty():
        url = workQueue.get(timeout=2)
        picname = "Process-" + str(index) + str(time.time())
        driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
        try:
            driver.maximize_window()
            driver.get(url)
            # Wait for the page to finish loading
            time.sleep(1)
            driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁(yè)截圖\img" + picname + ".png")
        except Exception:
            # Exception rather than a bare except, so BaseException-derived
            # signals are not swallowed here
            print("出錯(cuò)")
        finally:
            # Close the browser on failure too; it used to leak whenever
            # loading or saving raised
            driver.quit()
def boss():
    """Producer: push every collected URL onto the global work queue without blocking."""
    # Fill the queue
    for target in link:
        workQueue.put_nowait(target)
if __name__ == "__main__":
    workQueue = Queue(10000)
    # Let the producer finish first so the workers start with a filled queue
    producer = gevent.spawn(boss)
    producer.join()
    # Five screenshot workers, numbered 0-4
    jobs = [gevent.spawn(getshot, worker_id) for worker_id in range(5)]
    gevent.joinall(jobs)
    end = time.time()
    print("Queue多協(xié)程爬蟲(chóng)所用時(shí)間:", end - start)
爬取結果
進一步學習
總結
- 可通過多線程、多進程、多協程的方式提升數據爬取的速度,但需合理選擇數量,一味地增加可能會適得其反。