技術(shù)路線:
python+requests+re
代碼如下:
import requests
from bs4 import BeautifulSoup
import re
def getHTMLcode(url, data):
    """Fetch *url* and return (page_text, encoding).

    Parameters:
        url: page URL to fetch.
        data: dict of HTTP request headers (e.g. {'User-Agent': ...}).

    Returns:
        (text, encoding) on success, (None, None) on any request failure,
        so callers can always unpack the result safely.
    """
    try:
        # timeout prevents the crawler from hanging forever on a dead server
        r = requests.get(url, headers=data, timeout=10)
        r.raise_for_status()
        # Use the content-sniffed encoding so non-ASCII text decodes correctly.
        r.encoding = r.apparent_encoding
        return r.text, r.encoding
    except requests.RequestException:
        # Narrowed from a bare `except:`; the original returned None here,
        # which crashed the caller's `html, encoding = ...` unpack.
        print('爬取失敗')
        return None, None
def parsePage(contain, html):
    """Parse one listing page and append per-article field lists to *contain*.

    Parameters:
        contain: output list, mutated in place; each appended element is a
                 flat list of [author, content, vote count, ...] strings
                 (one triple per regex match inside the article div).
        html: raw page source text.
    """
    soup = BeautifulSoup(html, "html.parser")
    # find_all( name , attrs , recursive , text , **kwargs )
    items = soup.find_all(name='div', class_='article')
    # Hoisted out of the loop: compile the pattern once, not once per item.
    # Groups: author (<h2>), content (<span>), vote count (<i class="number">).
    pattern = re.compile(
        r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<i class="number">(.*?)</i>', re.S)
    for item in items:
        print(item)
        print('-------------------------------')
        fields = []
        # findall returns a list of 3-tuples; flatten them into one list.
        for group in pattern.findall(str(item)):
            fields.extend(group)
        contain.append(fields)
def saveArticle(contain, filPath):
    """Append each parsed article to *filPath* as UTF-8 text.

    Parameters:
        contain: list of [author, content, votes] string lists.
        filPath: path of the output text file (opened in append mode).
    """
    # Open the file once instead of reopening it for every article.
    with open(filPath, 'a+', encoding='utf-8') as f:
        for article in contain:
            author = article[0].strip('\n')
            # drop HTML line breaks from the content
            content = article[1].strip('\n').replace('<br/>', '')
            votes = article[2].strip('\n')
            f.write('作者:' + author + '\n' + '內(nèi)容:\n' + content + '\n點(diǎn)贊數(shù):' + votes + '\n\n\n\n')
def spyder(url, data, depth, filPath):
    """Crawl listing pages 1..depth and save all parsed articles.

    Parameters:
        url: base listing URL ending with 'page/' (page number is appended).
        data: HTTP request headers dict.
        depth: number of pages to crawl.
        filPath: output file path passed through to saveArticle.
    """
    for i in range(depth):
        # BUG FIX: the original did `url = url + str(depth + 11)`, which both
        # appended the same constant every iteration and kept growing the URL,
        # so no real page 1..depth was ever fetched. Build each page URL fresh.
        page_url = url + str(i + 1)
        result = getHTMLcode(page_url, data)
        # Guard against a failed fetch (getHTMLcode may return None/(None, None)).
        if not result or result[0] is None:
            continue
        html, encoding = result
        # Original skipped pages sniffed as ISO-8859-2 — presumably empty or
        # error pages with no real content; keep that guard. TODO confirm.
        if encoding == 'ISO-8859-2':
            continue
        contain = []
        parsePage(contain, html)
        saveArticle(contain, filPath)
if __name__ == "__main__":
    depth = 10  # number of listing pages to crawl
    url = 'https://www.qiushibaike.com/hot/page/'
    data = {'User-Agent': 'Mozilla/5.0'}  # minimal headers so the site serves the page
    # BUG FIX: saveArticle writes plain UTF-8 text, so a .docx extension
    # produces a file Word cannot open; save it as a .txt file instead.
    filPath = 'newarticle.txt'
    spyder(url, data, depth, filPath)
關(guān)鍵點(diǎn):
正則表達(dá)式的編寫:
pattern = re.compile(
r'&lt;div class="author clearfix"&gt;.*?&lt;h2&gt;(.*?)&lt;/h2&gt;.*?&lt;span&gt;(.*?)&lt;/span&gt;.*?&lt;i class="number"&gt;(.*?)&lt;/i&gt;', re.S)
說明:
1:(.*?)代表分組,用正則的方法re.findall(pattern, item)時(shí),每個(gè)匹配的字符串里面我們想要的部分會以元組的形式返回;如果一篇文章里有多組匹配的字符串,最終結(jié)果會返回以元組為元素的列表。
2:re.S代表.包括匹配換行