import requests
from bs4 import BeautifulSoup
import re
# Crawl the articles listed on a personal Jianshu homepage
articleList = []  # holds the scraped articles
articleInfo = {}
jianshu = 'www.reibang.com'
try:
    r = requests.get('http://www.reibang.com/u/16d377e2ed69')  # returns a Response object; change this link to crawl a different user
    print(r.url)
except Exception as e:
    print("The exception is {}".format(e))
    raise  # r is undefined if the request failed, so stop here
soup = BeautifulSoup(r.text, 'lxml')
# artList = soup.select('ul > li')  # direct li children of the ul
artList = soup.select('ul[class=note-list]')
# print(len(artList))  # confirm the list was found
# print(type(artList))
artList = artList[0]  # take the first match, a Tag
# print(type(artList))  # check the type
f = open('test.txt', 'a', encoding='utf-8')
YM = re.compile(r'\d{4}-\d{2}-\d{2}')  # date part, e.g. 2017-08-01
HM = re.compile(r'\d\d:\d\d:\d\d')     # time part, e.g. 12:30:45
lenTitle = []
for article in artList.find_all('li'):
    title = article.find_all('a', class_='title')[0].text
    time = article.select('span[class=time]')[0]['data-shared-at']
    url = article.find_all('a', class_='title')[0]['href']
    getYM = re.search(YM, time)  # date part of the timestamp
    getHM = re.search(HM, time)  # time part of the timestamp
    finish_time = getYM.group() + ' ' + getHM.group()
    lenTitle.append(len(title))
    print('Title: %30s  Finished: %s  URL: %s%s' % (title, finish_time, jianshu, url), file=f)
f.close()
# TODO for the next version:
# 1. Read the username from the terminal, then crawl that user's page
# 2. Save the results to a CSV file (a rough sketch of items 1 and 2 follows below)
# 3. Handle dynamically loaded pages (this one......)
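
# --- Sketch for the next version ----------------------------------------------
# A minimal sketch of TODO items 1 and 2, assuming the same page structure this
# script already relies on ('ul.note-list', 'a.title', 'span.time'). The function
# name crawl_user_to_csv and the filename 'articles.csv' are made up for
# illustration; dynamic loading (item 3) is not handled here.
import csv

def crawl_user_to_csv(username, out_path='articles.csv'):
    """Crawl one user's homepage and write title/time/url rows to a CSV file."""
    resp = requests.get('http://www.reibang.com/u/{}'.format(username))
    page = BeautifulSoup(resp.text, 'lxml')
    note_list = page.select('ul[class=note-list]')[0]
    with open(out_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['title', 'finish_time', 'url'])
        for item in note_list.find_all('li'):
            link = item.find_all('a', class_='title')[0]
            shared_at = item.select('span[class=time]')[0]['data-shared-at']
            finish = re.search(YM, shared_at).group() + ' ' + re.search(HM, shared_at).group()
            writer.writerow([link.text, finish, jianshu + link['href']])

# Intended usage (item 1: read the user id from the terminal):
# crawl_user_to_csv(input('Jianshu user id: '))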