一、各章小說鏈接爬取
1、章節URL:https://book.qidian.com/info/1012053141#Catalog
2、經過審查元素，各章節名稱和鏈接如下圖
3、可以通過如下方法獲取章節(jié)名和各章節(jié)鏈接
def get_download_chapter_url(self):
    """Fetch the catalog page and collect every chapter name and URL.

    Appends to self.chapterUrls / self.chapterNames in catalog order,
    then prints both lists as a quick visual check.
    """
    # The catalog is served over https; skip certificate verification.
    # NOTE(review): insecure — acceptable for a throwaway demo only.
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(self.server, context=context)
    text = str(response.read(), encoding='utf-8')
    page = BeautifulSoup(text, 'html.parser')
    # Each volume of the book sits in a <div class="volume">,
    # with one <li> per chapter.
    for volume in page.find_all('div', class_='volume'):
        for item in volume.find_all('li'):
            # Use the Tag directly instead of the original
            # BeautifulSoup(str(a), 'lxml') re-serialize/re-parse round trip,
            # which was wasteful and needlessly required lxml.
            anchor = item.find('a')
            if anchor is None:
                continue
            # hrefs on the page are protocol-relative ("//read.qidian.com/...").
            self.chapterUrls.append('https:' + str(anchor.get('href')))
            self.chapterNames.append(anchor.string)
    print(self.chapterUrls)
    print(self.chapterNames)
4、運行一下效果如下
二、爬取所有章節內容,并保存到文件中
1、爬取章節內容
可以看出，小說章節內容全在某個div下，可通過下面方法爬取
def get_contents(self, target):
    """Download one chapter page and return its body as plain text.

    target: absolute https URL of the chapter page.
    Returns the chapter text with qidian's 8-NBSP paragraph indents
    replaced by blank lines. Raises IndexError if the content div
    is missing from the page.
    """
    context = ssl._create_unverified_context()
    # Close the HTTP response deterministically; the original never
    # closed it and leaked the underlying socket.
    with urllib.request.urlopen(target, context=context) as response:
        text = str(response.read(), encoding='utf-8')
    page = BeautifulSoup(text, 'html.parser')
    divs = page.find_all('div', class_='read-content j_readContent')
    # Paragraphs are indented with 8 non-breaking spaces on qidian.
    return divs[0].text.replace('\xa0' * 8, '\n\n')
2、將文章輸出txt
def write(self, name, path, txt):
    """Append one chapter to the output file.

    name: chapter title, written on its own line before the body.
    path: target text file; created if absent, appended to otherwise.
    txt:  chapter body text.
    """
    # The original set an unused write_flag local (dead code) and used
    # f.writelines(txt), which iterates a str character by character;
    # a single f.write is equivalent and clearer.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(name + '\n')
        f.write(txt)
        f.write('\n\n')
三、完整代碼和效果
1、完整代碼
#!/usr/bin/env python3
# -*- coding:UTF-8 -*-
import urllib.request
import string, ssl, sys
from bs4 import BeautifulSoup
class DownloadNovel():
    """Scrape a qidian.com novel: collect chapter links, then download text."""

    def __init__(self, server):
        # Catalog page URL, e.g. https://book.qidian.com/info/<id>#Catalog
        self.server = server
        # Chapter titles, in catalog order.
        self.chapterNames = []
        # Absolute chapter URLs, parallel to chapterNames.
        self.chapterUrls = []
        # Number of chapters found; 0 until get_download_chapter_url runs
        # (the original initialized this but never updated it).
        self.nums = 0

    def get_download_chapter_url(self):
        """Fetch the catalog page and fill chapterNames/chapterUrls/nums."""
        # Skip certificate verification so the https fetch works without
        # a CA bundle. NOTE(review): insecure — demo use only.
        context = ssl._create_unverified_context()
        # Close the response deterministically (original leaked it).
        with urllib.request.urlopen(self.server, context=context) as response:
            text = str(response.read(), encoding='utf-8')
        page = BeautifulSoup(text, 'html.parser')
        # Each volume is a <div class="volume"> with one <li> per chapter.
        for volume in page.find_all('div', class_='volume'):
            for item in volume.find_all('li'):
                # Use the Tag directly instead of re-parsing str(a) with
                # lxml — the round trip was wasteful and needlessly
                # required the lxml package.
                anchor = item.find('a')
                if anchor is None:
                    continue
                # hrefs are protocol-relative ("//read.qidian.com/...").
                self.chapterUrls.append('https:' + str(anchor.get('href')))
                self.chapterNames.append(anchor.string)
        self.nums = len(self.chapterNames)

    def get_contents(self, target):
        """Download one chapter page and return its body as plain text.

        Raises IndexError if the content div is missing from the page.
        """
        context = ssl._create_unverified_context()
        with urllib.request.urlopen(target, context=context) as response:
            text = str(response.read(), encoding='utf-8')
        page = BeautifulSoup(text, 'html.parser')
        divs = page.find_all('div', class_='read-content j_readContent')
        # Paragraphs are indented with 8 non-breaking spaces on qidian.
        return divs[0].text.replace('\xa0' * 8, '\n\n')

    def write(self, name, path, txt):
        """Append chapter title `name` and body `txt` to the file at `path`."""
        # Dropped the unused write_flag local; f.write(txt) replaces
        # f.writelines(txt), which iterated the str char by char.
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.write(txt)
            f.write('\n\n')
if __name__ == '__main__':
    # Entry point: scrape the catalog, then download every chapter
    # into a single text file, showing progress on one console line.
    dl = DownloadNovel('https://book.qidian.com/info/1012053141#Catalog')
    dl.get_download_chapter_url()
    print('開始下載')
    total = len(dl.chapterNames)
    for i in range(total):
        dl.write(dl.chapterNames[i], '無限火力大暴走.txt', dl.get_contents(dl.chapterUrls[i]))
        # BUG FIX: the original printed float(i/total) — a 0..1 fraction —
        # under a "%" sign, so it showed e.g. "0.500%" and never reached
        # 100%. Scale by 100 and count the just-finished chapter (i + 1).
        sys.stdout.write(" 已下載:%.3f%%" % (100.0 * (i + 1) / total) + '\r')
        sys.stdout.flush()
    print('下載完成')
2、效果
源碼鏈接