這是第一個成功獲取到的版本,下個版本預計添加一個字典,將書名與url放入,方便用戶直接通過搜索書名來直接下載。目前還沒想好書名錯誤的情況,源碼如下:
# !/usr/bin/env python
# -*- coding:utf-8 -*-
#文件 :Module_10_24_novel.py
# author named sunxth
#IDE pycharm
import os
from urllib import request
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import jsonpath
# Index page of the novel; chapter hrefs are resolved relative to this URL.
INDEX_URL = "https://www.biqukan.com/40_40243/"

# File the downloaded chapters are written to (UTF-8).
OUTPUT_PATH = "有妖氣客棧.txt"

# Pages are decoded as GBK; undecodable bytes are dropped instead of aborting.
PAGE_ENCODING = "gbk"

# Seconds to wait on a single HTTP request before giving up.
REQUEST_TIMEOUT = 30

# Header values must be latin-1 encodable. The original User-Agent contained a
# literal ellipsis character (U+2026), which http.client cannot encode, so it
# is replaced here by a complete Firefox 63 User-Agent string.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) '
                  'Gecko/20100101 Firefox/63.0'
}

# The original script compared the heading against the full
# "<book title> + 正文卷" string, but that literal was corrupted, so the
# comparison could never succeed. Matching on the suffix avoids depending on
# the exact title spelling.
# NOTE(review): assumes the body heading on the index page ends with this
# suffix -- confirm against the live page.
BODY_HEADING_SUFFIX = "正文卷"


def fetch_html(url):
    """Download *url* using REQUEST_HEADERS and return the decoded page text.

    The request object (not the bare URL) is passed to ``urlopen`` so the
    headers are actually sent, and the response is closed when done.
    Network errors from ``urlopen`` propagate to the caller.
    """
    req = request.Request(url=url, headers=REQUEST_HEADERS)
    with request.urlopen(req, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode(PAGE_ENCODING, "ignore")


def chapter_url(href, base=INDEX_URL):
    """Return the absolute URL for a chapter *href* found on the index page.

    ``urljoin`` handles both site-absolute ("/40_40243/1.html") and relative
    ("1.html") hrefs without producing a doubled slash.
    """
    return urljoin(base, href)


def is_body_heading(text):
    """Return True if *text* is the heading that starts the main chapter list.

    ``None`` and empty strings are treated as "not a heading".
    """
    return bool(text) and text.strip().endswith(BODY_HEADING_SUFFIX)


def collect_chapter_links(index_soup):
    """Return ``[(title, absolute_url), ...]`` for the chapters on the index page.

    Only ``dt``/``dd`` tags that are direct children of the ``dl`` inside
    ``div.listmain`` are examined, so whitespace text nodes are never touched.
    Links that appear before the body heading are skipped, as in the original
    script. Returns an empty list when the expected structure is missing.
    """
    listmain = index_soup.find('div', class_="listmain")
    if listmain is None or listmain.dl is None:
        return []

    links = []
    in_body = False
    for node in listmain.dl.find_all(['dt', 'dd'], recursive=False):
        if node.name == 'dt':
            if is_body_heading(node.get_text()):
                in_body = True
            continue
        anchor = node.a
        if not in_body or anchor is None:
            continue
        href = anchor.get('href')
        if not href:
            # An anchor without a target cannot be downloaded.
            continue
        links.append((anchor.get_text(), chapter_url(href)))
    return links


def extract_chapter_text(chapter_soup):
    """Return the text of the ``#content.showtxt`` element, or None if absent."""
    content = chapter_soup.find(id="content", class_="showtxt")
    if content is None:
        return None
    return content.text


def main():
    """Download every chapter listed on INDEX_URL into OUTPUT_PATH.

    Each chapter is written as its title, a blank line, the chapter text and a
    trailing newline. Exits with an error message if no chapter links are
    found, before the output file is created.
    """
    index_soup = BeautifulSoup(fetch_html(INDEX_URL), 'lxml')
    chapters = collect_chapter_links(index_soup)
    if not chapters:
        raise SystemExit("no chapter links found on index page: " + INDEX_URL)
    print("計(jì)算的章節(jié)個(gè)數(shù)", len(chapters))

    # The context manager closes the file even if a download fails midway.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out_file:
        for title, link in chapters:
            chapter_soup = BeautifulSoup(fetch_html(link), 'lxml')
            body = extract_chapter_text(chapter_soup)
            if body is None:
                print("skipped chapter without content block:", title)
                continue
            # Chapter title first, then the text in a single write.
            out_file.write(title + "\n\n")
            out_file.write(body)
            print("下載完成的章節(jié):", title)
            # Newline after each chapter keeps the chapters separated.
            out_file.write('\n')
    print('completed')


if __name__ == '__main__':
    main()