總體思路
使用多線程爬蟲可以提高爬取和儲存的速度。雖然 Python 中的線程是「假的」(受 GIL 限制,無法真正並行執行),但對於 IO 操作來說,多線程是起作用的。
總體思路用生產者與消費者的模型來設計。
- 將要爬取的 url 放入 urlQueue 的隊列中。
- 負責爬取網頁信息的工人(線程),從 url 隊列獲取 url,進行請求,把爬取的網頁信息放入一個 dataQueue 的隊列中。
- 負責解析的工人,從 dataQueue 中獲取網頁信息,進行解析後,存儲。
# coding=utf8
import json
import time
import urllib
import urllib2
from Queue import Empty
from Queue import Queue
from threading import Thread

from lxml import etree
# Shutdown flag polled by ThreadCrawl.run(); main() sets it to True once the
# page queue has been emptied.
CRAWL_EXIT = False
# Shutdown flag polled by ThreadParse.run(); main() sets it to True once the
# data queue has been emptied.
PARSE_EXIT = False
class ThreadCrawl(Thread):
    """Worker thread that downloads listing pages.

    Each iteration takes a page number from ``pageQueue``, requests the
    matching qiushibaike text page and puts the raw response body on
    ``dataQueue``. The loop ends when the module-level ``CRAWL_EXIT`` flag
    becomes true.

    Args:
        threadName: Label printed when the thread starts or reports an error.
        pageQueue: Queue of integer page numbers to fetch.
        dataQueue: Queue that receives the downloaded page bodies.
    """

    # Seconds to wait for a page number before re-checking CRAWL_EXIT.
    POLL_TIMEOUT = 1

    def __init__(self, threadName, pageQueue, dataQueue):
        super(ThreadCrawl, self).__init__()
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.threadName = threadName
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/57.0.2987.133 Safari/537.36'
            ),
        }

    def run(self):
        """Fetch pages until ``CRAWL_EXIT`` is set."""
        print('%s啟動' % self.threadName)
        while not CRAWL_EXIT:
            # A timed get (instead of an unbounded blocking one) lets the
            # thread notice CRAWL_EXIT even when the queue stays empty;
            # otherwise it could block forever and join() would hang.
            try:
                page = self.pageQueue.get(True, self.POLL_TIMEOUT)
            except Empty:
                continue

            url = 'https://www.qiushibaike.com/text/page/%d/' % page
            try:
                request = urllib2.Request(url, headers=self.headers)
                response = urllib2.urlopen(request).read()
            except Exception as exc:
                # Best effort: report the failure and keep the worker alive
                # rather than silently dropping the page.
                print('%s failed to fetch page %d: %r'
                      % (self.threadName, page, exc))
                continue
            self.dataQueue.put(response)
class ThreadParse(Thread):
    """Worker thread that parses downloaded pages and stores the posts.

    Each iteration takes one HTML document from ``dataQueue``, extracts the
    posts from it and appends them to ``fileName`` as one JSON object per
    line. The loop ends when the module-level ``PARSE_EXIT`` flag becomes
    true.

    Args:
        parseName: Label printed when the thread starts or reports an error.
        dataQueue: Queue of raw HTML page bodies.
        fileName: Open, writable file object (despite the name, not a path).
    """

    # Seconds to wait for a page body before re-checking PARSE_EXIT.
    POLL_TIMEOUT = 1

    def __init__(self, parseName, dataQueue, fileName):
        super(ThreadParse, self).__init__()
        self.dataQueue = dataQueue
        self.parseName = parseName
        self.fileName = fileName

    def run(self):
        """Parse queued pages until ``PARSE_EXIT`` is set."""
        print('%s啟動' % self.parseName)
        while not PARSE_EXIT:
            # Timed get: avoids spinning at full CPU on an empty queue while
            # still re-checking PARSE_EXIT regularly.
            try:
                html = self.dataQueue.get(True, self.POLL_TIMEOUT)
            except Empty:
                continue

            try:
                self.parse(html)
            except Exception as exc:
                # Best effort: one bad page must not kill the worker, but the
                # failure should be visible.
                print('%s failed to parse a page: %r' % (self.parseName, exc))

    def parse(self, html):
        """Extract every post from ``html`` and write it to the output file.

        Posts without a content node are skipped. Posts without a user name
        are written with ``username`` set to ``None``.

        Args:
            html: Raw HTML of one listing page.
        """
        text = etree.HTML(html)
        if text is None:
            # Nothing parseable (e.g. an empty response body).
            return

        # Every post lives in a div whose id contains "qiushi_tag".
        node_list = text.xpath('//div[contains(@id,"qiushi_tag")]')
        for node in node_list:
            # Post body: text of the first span inside the content div.
            content_nodes = node.xpath('.//div[@class="content"]/span')
            if not content_nodes:
                continue
            content = content_nodes[0].text

            # User name; fall back to None so a missing name never reuses
            # the previous post's name or raises NameError.
            name_nodes = node.xpath('./div[1]/a[2]/h2')
            if name_nodes:
                username = name_nodes[0].text
            else:
                print('沒有用戶')
                username = None

            items = {'username': username,
                     'content': content}
            self.fileName.write(
                json.dumps(items, ensure_ascii=False).encode('utf8') + '\n')
def main():
    """Crawl pages 1-10 and append the parsed posts to ``duanzi.json``.

    Starts three crawl threads and three parse threads, then shuts them down
    in order: the crawlers are stopped and joined first, so every fetched
    page is already on the data queue before the parsers are told to stop.
    """
    global CRAWL_EXIT, PARSE_EXIT

    # Queue of page numbers still to be fetched.
    pageQueue = Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)
    # Queue of downloaded HTML bodies waiting to be parsed.
    dataQueue = Queue()

    # Open the output first so a failure here happens before any thread is
    # started, and so the file is closed even if an error occurs later.
    with open('duanzi.json', 'a') as fileName:
        crawl_threads = []
        for tname in ['采集線程一號', '采集線程二號', '采集線程三號']:
            thread = ThreadCrawl(tname, pageQueue, dataQueue)
            thread.start()
            crawl_threads.append(thread)

        parse_threads = []
        for tname in ['解析線程一號', '解析線程二號', '解析線程三號']:
            thread = ThreadParse(tname, dataQueue, fileName)
            thread.start()
            parse_threads.append(thread)

        # Wait until every page number has been taken; sleep instead of
        # spinning so the workers are not starved of CPU time.
        while not pageQueue.empty():
            time.sleep(0.1)
        CRAWL_EXIT = True

        # Join the crawlers before looking at dataQueue: a crawler may still
        # be downloading its last page, and its result must not be missed.
        for thread in crawl_threads:
            thread.join()
        print('采集完成')

        while not dataQueue.empty():
            time.sleep(0.1)
        PARSE_EXIT = True

        # A parser finishes the page it is working on before it exits.
        for thread in parse_threads:
            thread.join()
        print('寫入完成')
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()