import threading
from queue import Queue, Empty
import time
from lxml import etree
import requests
import json

# Flag that tells the parse threads when to exit
g_parse_flag = True
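# Note: the flag is written by the main thread and read by the parse
# threads without a lock; CPython's GIL makes a plain bool read/write
# safe here, but threading.Event would be the more idiomatic signal.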

class CrawlThread(threading.Thread):
    def __init__(self, name, page_queue, data_queue):
        super().__init__()
        self.name = name
        # queue of page numbers still to be crawled
        self.page_queue = page_queue
        # queue that hands raw HTML to the parse threads
        self.data_queue = data_queue
        # URL template: self.url.format(3) -> 'http://www.fanjian.net/duanzi-3'
        self.url = 'http://www.fanjian.net/duanzi-{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }

    def run(self):
        print('%s thread starting' % self.name)
        # Crawl-thread steps:
        while 1:
            # 1. Take a page number off the page queue. A non-blocking
            #    get avoids the race where empty() returns False but
            #    another crawl thread grabs the last page first.
            try:
                page = self.page_queue.get(block=False)
            except Empty:
                break
            # 2. Splice the page number into the URL template
            url = self.url.format(page)
            # 3. Send the request and fetch the response
            r = requests.get(url=url, headers=self.headers)
            # Pause between requests to go easy on the server
            time.sleep(1)
            # 4. Put the response body on the data queue
            self.data_queue.put(r.text)
        print('%s thread finished' % self.name)
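
    # Note: requests.get() above is called without a timeout, so a hung
    # connection stalls the crawl thread indefinitely, and any network
    # exception kills it. Passing e.g. timeout=10 and catching
    # requests.RequestException would make the loop more robust.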

class ParseThread(threading.Thread):
    def __init__(self, name, data_queue, lock, fp):
        super().__init__()
        self.name = name
        # queue holding raw HTML from the crawl threads
        self.data_queue = data_queue
        # lock guarding the shared output file
        self.lock = lock
        self.fp = fp

    def run(self):
        print('%s thread starting' % self.name)
        # Parse-thread steps:
        while 1:
            # 1. Take one response off the data queue. The timeout lets
            #    the thread re-check the exit flag instead of blocking
            #    forever in get() once the queue drains.
            try:
                content = self.data_queue.get(timeout=3)
            except Empty:
                if not g_parse_flag:
                    break
                continue
            # 2. Parse the response into a list of item dicts
            items = self.parse_content(content)
            # 3. Serialize and write to the file; ensure_ascii=False
            #    keeps the scraped Chinese text readable in duanzi.txt
            string = json.dumps(items, ensure_ascii=False)
            # Only one thread may write to the shared file at a time
            with self.lock:
                self.fp.write(string + '====\n')
        print('%s thread finished' % self.name)

    # Parse one page of HTML into a list of item dicts
    def parse_content(self, content):
        # Build an element tree from the raw HTML
        tree = etree.HTML(content)
        # Every entry sits in an li.cont-item element
        li_list = tree.xpath('//li[@class="cont-item"]')
        items = []
        for oli in li_list:
            # avatar image
            face = oli.xpath('.//div[@class="cont-list-reward"]//img/@data-src')[0]
            # author name
            name = oli.xpath('.//div[@class="cont-list-head"]/a/text()')[0]
            # post text
            text = oli.xpath('.//div[@class="cont-list-main"]/p/text()')[0]
            # publication time
            shijian = oli.xpath('.//div[@class="cont-list-info fc-gray"]/text()')[-1]
            item = {
                'avatar': face,
                'name': name,
                'content': text,
                'time': shijian,
            }
            # Collect the dict for this entry
            items.append(item)
        return items
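
    # Note: every xpath(...)[0] above assumes the selector matched. If
    # the site's markup changes or an error page comes back, the result
    # list is empty and [0] raises IndexError, which silently kills the
    # parse thread. A defensive variant (illustrative) would be:
    #     nodes = oli.xpath('.//div[@class="cont-list-head"]/a/text()')
    #     name = nodes[0] if nodes else ''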

def create_queue():
    page_queue = Queue()
    data_queue = Queue()
    # Pre-load the page-number queue with pages 1-10
    for page in range(1, 11):
        page_queue.put(page)
    return page_queue, data_queue
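
# queue.Queue is already thread-safe, so the crawl and parse threads can
# share page_queue and data_queue without extra locking of their own; the
# Lock created in main() only guards the shared file handle.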

def main():
    # Lock guarding the shared output file
    lock = threading.Lock()
    # Open the output file
    fp = open('duanzi.txt', 'w', encoding='utf8')
    # Create the two queues
    page_queue, data_queue = create_queue()
    # Names for the crawl and parse threads
    crawlname_list = ['Crawl-1', 'Crawl-2', 'Crawl-3']
    parsename_list = ['Parse-1', 'Parse-2', 'Parse-3']
    # Lists that keep a handle on every crawl and parse thread
    t_crawl_list = []
    t_parse_list = []
    for crawlname in crawlname_list:
        t_crawl = CrawlThread(crawlname, page_queue, data_queue)
        t_crawl.start()
        # Save the crawl thread so it can be joined later
        t_crawl_list.append(t_crawl)
    for parsename in parsename_list:
        t_parse = ParseThread(parsename, data_queue, lock, fp)
        # Save the parse thread so it can be joined later
        t_parse_list.append(t_parse)
        t_parse.start()
    # Wait until the crawl threads have fetched every page
    for t_crawl in t_crawl_list:
        t_crawl.join()
    # Once the data queue drains, signal the parse threads to exit
    while 1:
        if data_queue.empty():
            global g_parse_flag
            g_parse_flag = False
            break
        time.sleep(0.5)
    # Let the main thread wait for the parse threads before closing the file
    for t_parse in t_parse_list:
        t_parse.join()
    fp.close()
    print('Main thread and worker threads all finished')

if __name__ == '__main__':
    main()

# Left as an exercise: why might no data get written to the file?
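
# One likely answer: with a bare data_queue.get(), a parse thread can
# block forever after the queue drains, the join() calls then hang, and
# fp.close() is never reached, so buffered output may never be flushed.
# A common alternative to flag-polling is the "sentinel" (poison-pill)
# scheme sketched below; the None sentinel and the loop shape are
# illustrative, not part of the original code:
#
#     for t_crawl in t_crawl_list:
#         t_crawl.join()
#     for _ in t_parse_list:
#         data_queue.put(None)        # one sentinel per parse thread
#
#     # and in ParseThread.run:
#     while 1:
#         content = self.data_queue.get()
#         if content is None:         # sentinel: no more data is coming
#             break
#         # ...parse and write as before...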