"""Crawl home-style recipe listings from meishij.net.

1. A thread pool fetches listing pages concurrently for throughput.
2. Each task is submitted to the pool as pool.submit(function, url).
3. XPath expressions extract the recipe data from each page.
4. Each recipe is appended locally to a JSON-lines file.
"""
from concurrent.futures import ThreadPoolExecutor
import requests
from requests import exceptions
from lxml import etree
import json
import threading
# ThreadPoolExecutor above provides the worker pool used in __main__.
def crawlPageDate(url, timeout=10, **kwargs):
    """Download one listing page and return its HTML.

    Args:
        url: Absolute URL of the listing page to fetch.
        timeout: Seconds to wait for the server; keeps a stalled
            connection from tying up a pool worker forever.
        **kwargs: Extra keyword arguments, echoed for debugging only.

    Returns:
        Tuple ``(html_text, 200)`` on a 200 response, otherwise
        ``(None, 404)`` (non-200 status or any request error).
    """
    # NOTE: the original did `print(url, **kwargs)`, which raises TypeError
    # for any key print() does not accept (only sep/end/file/flush are legal).
    print(url, kwargs)
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            print('請(qǐng)求成功')
            return response.text, 200
    except exceptions.HTTPError as err:
        print(err)
    except exceptions.ConnectTimeout as err:
        print(err)
    except exceptions.RequestException as err:
        # Catch-all for the requests exception hierarchy (DNS, SSL, ...).
        print(err)
    # Fall through here on non-200 status or any handled exception.
    return None, 404
def done(futures):
    """Future callback: parse a fetched page and append recipes to cai.json.

    Args:
        futures: The completed Future whose result is the ``(html, status)``
            tuple returned by ``crawlPageDate``.

    Side effects:
        Appends one JSON object per recipe card (one per line) to
        ``cai.json``, serialized under the module-global ``lock`` because
        callbacks run concurrently on pool worker threads.
    """
    print(futures)
    # .result() re-raises any exception the task raised.
    html, status = futures.result()
    print(status)
    if html:
        x_html = etree.HTML(html)
        caipu_list = x_html.xpath('//div[@class="listtyle1"]')
        for cai_div in caipu_list:
            item = {}
            # BUG FIX: the original used the absolute path '//img[...]',
            # which matches the FIRST image of the whole document for every
            # card. './/' scopes the query to this card only.
            cover = cai_div.xpath('.//img[@class="img"]/@src')
            item['coverImage'] = cover[0] if cover else '暫無(wú)'
            item['type'] = cai_div.xpath('.//a/strong[@class="gx"]/span/text()')
            if len(item['type']) > 0:
                item['type'] = item['type'][0]
            else:
                item['type'] = '暫無(wú)'
            title = cai_div.xpath('.//div[@class="c1"]/strong/text()')
            item['title'] = title[0] if title else '暫無(wú)'
            print(item)
            # `with lock` releases even if open()/write() raises; the bare
            # acquire/release pair would leak the lock on error.
            with lock:
                # Explicit utf-8: the JSON contains non-ASCII characters.
                with open('cai.json', 'a', encoding='utf-8') as file:
                    json_str = json.dumps(item, ensure_ascii=False) + '\n'
                    file.write(json_str)
if __name__ == '__main__':
    # BUG FIX: the lock must exist BEFORE any task is submitted — in the
    # original it was created after the loop, so a fast callback could hit
    # NameError looking up the module-global `lock`.
    lock = threading.Lock()
    # 56 I/O-bound pages: a modest pool is plenty (the original asked for
    # an astronomically large max_workers).
    pool = ThreadPoolExecutor(max_workers=16)
    for page in range(1, 57):
        # submit(fn, *args, **kwargs) schedules fn(url) on a worker thread.
        url = 'https://www.meishij.net/chufang/diy/jiangchangcaipu/?&page=' + str(page)
        result = pool.submit(crawlPageDate, url)
        # add_done_callback takes the callable itself; it fires when the
        # future completes (callbacks may run on worker threads).
        result.add_done_callback(done)
    # Block until every page has been fetched and parsed before exiting.
    pool.shutdown(wait=True)