目标:爬取任意贴吧的前 50 页,并保存到本地。

观察贴吧网页的 URL 规律:
1. 前两张图片分别是 LOL 吧首页和李毅吧首页,对比发现:
   当我们搜索不同的贴吧时,只有网址中 kw 后面的参数有变化,该参数表示吧名。
2. 后两张图片分别是 LOL 吧第二页和第三页的信息,对比发现:
   代表页码的参数是 pn 后面的参数,以 50 的倍数递增。
编写代码:
import requests
class TiebaSpider:
    """Crawl the first ``page_count`` listing pages of a Baidu Tieba forum and save each page's HTML locally.

    The listing URL pattern is ``.../f?kw=<forum>&ie=utf-8&pn=<offset>``,
    where ``kw`` selects the forum and the ``pn`` offset grows in multiples
    of 50 (one listing page = 50 entries).
    """

    def __init__(self, tieba_name, page_count=50):
        # tieba_name: forum name substituted into the `kw` query parameter.
        # page_count: number of listing pages to fetch; defaults to 50 per
        #             the stated goal (the original hard-coded 1000 pages).
        self.tieba_name = tieba_name
        self.page_count = page_count
        self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0"}

    def get_url_list(self):
        """Return the list of listing-page URLs; `pn` advances by 50 per page."""
        return [self.url_temp.format(i * 50) for i in range(self.page_count)]

    def parse_url(self, url):
        """Send a GET request and return the decoded response body."""
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def save_html(self, html_str, page_num):
        """Save one page's HTML as '<forum>-第<page_num>页.html' in the working directory."""
        file_path = "{}-第{}页.html".format(self.tieba_name, page_num)
        # BUG FIX: the original called open("", "") — an empty path and an
        # invalid mode — which raises at runtime. Use the computed path,
        # write mode, and an explicit UTF-8 encoding.
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)

    def run(self):
        """Fetch every listing page and save it; page numbers start at 1."""
        # BUG FIX: the original computed page_num as url_list.index(url) + i,
        # where `i` is undefined (NameError). enumerate() is correct and
        # avoids the O(n) .index() lookup per page.
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url)
            self.save_html(html_str, page_num)
# BUG FIX: the original guard was `if__name___ == '__name__':` — a syntax
# error (no space after `if`, mangled dunder) that also compared against the
# wrong literal ('__name__' instead of '__main__'), so the spider never ran.
if __name__ == "__main__":
    tieba_spider = TiebaSpider("lol")
    tieba_spider.run()