Python 爬取簡書個人文章目錄、查看數及鏈接地址
1. 通過360極速瀏覽器的審查元素,選“Network”->“XHR”選項,滾動頁面,找出連接地址構成的規律。http://www.reibang.com/u/55b597320c4e?order_by=shared_at&page=2
如下圖:
2. 根據文件數和每頁顯示的數量,構建鏈接地址。
urls =[ 'http://www.reibang.com/u/55b597320c4e?order_by=shared_at&page={}'.format(str(i)) for i in range(1,13)]
3. 使用LXML庫,查找需要的標題,查看量,超鏈地址。
代碼如下:
# -*- coding: utf-8 -*-
"""Scrape article titles, view counts and links from a user's list pages into MongoDB."""
import time
from multiprocessing import Pool  # pool of worker *processes* (not threads)

import pymongo
import requests
from lxml import etree

# connect=False postpones opening sockets and monitor threads until the first
# database operation.  The first operation in this script happens inside the
# Pool workers, so each worker connects for itself instead of inheriting a
# live connection from the parent process (PyMongo clients are not fork-safe).
client = pymongo.MongoClient('localhost', 27017, connect=False)
mydb = client['mydb']
jianshu_user_dy = mydb['jianshu_user_dy']

# Headers sent with every page request; X-Requested-With marks the request as
# an XHR call, matching what the browser sends when the list is scrolled.
headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Referer': 'http://www.reibang.com/u/9104ebf5e177',
}
def get_infos(url, timeout=10):
    """Fetch one article-list page and store every listed article in MongoDB.

    For each ``<li>`` under the element with id ``list-container`` the title,
    view count and absolute article URL are extracted, printed, and inserted
    into the ``jianshu_user_dy`` collection.  Request failures and
    parsing/storage failures are reported on stdout instead of being raised,
    so a single bad page does not abort a ``Pool.map`` run.

    Args:
        url: Address of the list page to fetch.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout a stalled connection would block the worker forever.

    Returns:
        None.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures rather than parsing an error page.
        response.raise_for_status()
    except requests.RequestException:
        # RequestException covers ConnectionError as well as timeouts,
        # HTTP errors and redirect loops.
        print("網頁出錯啦!***************")
        return

    try:
        selector = etree.HTML(response.text)
        for item in selector.xpath('//*[@id="list-container"]/ul/li'):
            title = item.xpath('div/a/text()')[0]
            # The view count is the last text node of the first meta link.
            view = item.xpath('div/div/a[1]/text()')[-1].strip()
            title_url = 'http://www.reibang.com' + item.xpath('div/a/@href')[0]
            print(title, view)
            jianshu_user_dy.insert_one({
                'title': title,
                'url': title_url,
                'view': view,
            })
    except Exception:
        # Deliberately broad: a missing node (IndexError), an unparsable page
        # or a database error all mean "nothing usable from this page".
        # Unlike a bare ``except:`` this lets KeyboardInterrupt and
        # SystemExit propagate so the workers can still be stopped.
        print("抓取不到內容咯???????????????")
# One list-page address per page of the user's articles (pages 1 to 12).
urls = [
    'http://www.reibang.com/u/55b597320c4e?order_by=shared_at&page={}'.format(page)
    for page in range(1, 13)
]


def main():
    """Scrape every address in ``urls`` with 4 worker processes and print the elapsed time."""
    start = time.time()
    # map() blocks until every page is processed; the ``with`` block then
    # shuts the pool down, also when map() raises.
    with Pool(processes=4) as pool:
        pool.map(get_infos, urls)
    print("合計用時:{}".format(str(time.time() - start)))


if __name__ == '__main__':
    main()