import json
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree
class CrawlThread(threading.Thread):
    """Worker thread that downloads listing pages.

    Page numbers are taken from ``page_queue``; the HTML text of every
    page that could be downloaded is put on ``data_queue``.

    Args:
        name: Thread name, also used in error messages.
        page_queue: Queue of page numbers still to be fetched.
        data_queue: Queue receiving the response text of fetched pages.
    """

    # Seconds to wait on the server before giving up on a single page.
    REQUEST_TIMEOUT = 10

    def __init__(self, name, page_queue, data_queue):
        super().__init__()
        self.name = name
        # Queue of page numbers shared by all crawler threads.
        self.page_queue = page_queue
        self.data_queue = data_queue
        # URL template; ``{}`` is replaced by the page number.
        self.url = 'http://www.fanjian.net/duanzi-{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }

    def run(self):
        """Fetch pages until ``page_queue`` is exhausted."""
        while True:
            # A non-blocking get avoids the check-then-get race: with
            # several workers, ``empty()`` may be False and the queue can
            # still be drained before a blocking ``get()`` runs, which
            # would then wait forever.
            try:
                page = self.page_queue.get_nowait()
            except Empty:
                break

            url = self.url.format(page)
            try:
                response = requests.get(
                    url=url,
                    headers=self.headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
            except requests.RequestException as exc:
                # One bad page must not kill the worker; report and go on.
                print('{}: failed to fetch {}: {}'.format(self.name, url, exc))
            else:
                self.data_queue.put(response.text)

            # Pause between requests, as the original code did.
            time.sleep(1)
class ParseThread(threading.Thread):
    """Worker thread that parses downloaded pages and writes the results.

    Each HTML document taken from ``data_queue`` is turned into a list of
    item dicts, serialised as one JSON array and appended to ``fp`` on a
    line of its own.

    The thread stops as soon as ``data_queue`` is empty, so it only does
    useful work if it is started when data is already queued.

    Args:
        name: Thread name.
        data_queue: Queue of HTML documents (strings) to parse.
        lock: Lock guarding writes to ``fp`` across parser threads.
        fp: Writable text file object shared by all parser threads.
    """

    def __init__(self, name, data_queue, lock, fp):
        super().__init__()
        self.name = name
        # Queue of downloaded page contents.
        self.data_queue = data_queue
        self.lock = lock
        self.fp = fp

    def run(self):
        """Parse queued documents until ``data_queue`` is empty."""
        while True:
            # Non-blocking get: checking ``empty()`` and then calling a
            # blocking ``get()`` can hang when another parser takes the
            # last item in between.
            try:
                content = self.data_queue.get_nowait()
            except Empty:
                break

            items = self.parse_content(content)
            string = json.dumps(items, ensure_ascii=False)
            print(self.fp,'類似會發(fā)士大夫')
            # ``with`` releases the lock even if the write raises.
            with self.lock:
                # One JSON array per line keeps the pages separable;
                # back-to-back arrays could not be parsed again.
                self.fp.write(string + '\n')

    @staticmethod
    def parse_content(content):
        """Extract the items of one listing page.

        Declared as a static method so that it works both as
        ``self.parse_content(content)`` and as a plain function taking only
        ``content``, which is the signature it was written with.

        Args:
            content: HTML text of a listing page.

        Returns:
            A list of dicts, one per ``li.cont-item`` element, holding the
            avatar URL, author name, text and time string. A field whose
            node is missing is stored as an empty string.
        """
        if not content:
            return []

        tree = etree.HTML(content)
        # NOTE(review): guards against lxml yielding no tree for
        # unparseable input - confirm against the lxml version in use.
        if tree is None:
            return []

        def pick(nodes, index):
            # Tolerate items missing a node instead of raising IndexError.
            return nodes[index] if nodes else ''

        items = []
        for oli in tree.xpath('//li[@class="cont-item"]'):
            # Avatar image URL.
            face = pick(oli.xpath('.//div[@class="cont-list-reward"]//img/@data-src'), 0)
            # Author name.
            name = pick(oli.xpath('.//div[@class="cont-list-head"]/a/text()'), 0)
            # Item text.
            text = pick(oli.xpath('.//div[@class="cont-list-main"]/p/text()'), 0)
            # Time: the last text node of the info block.
            shijian = pick(oli.xpath('.//div[@class="cont-list-info fc-gray"]/text()'), -1)
            items.append({
                '頭像': face,
                '名字': name,
                '內(nèi)容': text,
                '時間': shijian,
            })
        return items
def create_queue(first_page=1, last_page=10):
    """Create the page-number queue and the (empty) page-content queue.

    Args:
        first_page: First page number to enqueue (inclusive). Defaults to 1.
        last_page: Last page number to enqueue (inclusive). Defaults to 10.

    Returns:
        A ``(page_queue, data_queue)`` tuple. ``page_queue`` holds the page
        numbers ``first_page..last_page`` in ascending order (nothing if
        ``last_page < first_page``); ``data_queue`` is empty.
    """
    page_queue = Queue()
    data_queue = Queue()
    for page in range(first_page, last_page + 1):
        page_queue.put(page)
    return page_queue, data_queue
def main():
    """Crawl the listing pages and write the parsed items to ``duanzi.txt``.

    The work runs in two phases: all crawler threads run to completion
    first, and only then are the parser threads started. A parser thread
    leaves its loop as soon as it finds the data queue empty, so starting
    parsers while the crawlers are still downloading makes them exit
    before any page arrives and the output file stays empty.
    """
    # Lock shared by the parser threads to serialise file writes.
    lock = threading.Lock()
    # Page numbers to fetch, and the queue of downloaded page contents.
    page_queue, data_queue = create_queue()

    crawlname_list = ['采集線程1', '采集線程2', '采集線程3']
    parsename_list = ['解析線程1', '解析線程2', '解析線程3']

    # ``with`` closes the file even if a thread cannot be started.
    with open('duanzi.txt', 'w', encoding='utf8') as fp:
        # Phase 1: download every page.
        t_crawl_list = [
            CrawlThread(crawlname, page_queue, data_queue)
            for crawlname in crawlname_list
        ]
        for t_crawl in t_crawl_list:
            t_crawl.start()
        for t_crawl in t_crawl_list:
            t_crawl.join()

        # Phase 2: parse what was downloaded and write it out.
        t_parse_list = [
            ParseThread(parsename, data_queue, lock, fp)
            for parsename in parsename_list
        ]
        for t_parse in t_parse_list:
            t_parse.start()
        # Keep the file open until every parser has finished writing.
        for t_parse in t_parse_list:
            t_parse.join()
# Run the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
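# Question: why is no data written to the output file?
# A parser thread exits as soon as it finds the data queue empty, so parser
# threads that start before any page has been downloaded write nothing.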