爬取起點(diǎn)中文網(wǎng)的免費(fèi)圖書
17553828-c507eef2269e4b24.png
import requests, os
from multiprocessing import Pool
from bs4 import BeautifulSoup
class QidianSpider:
    """Scraper for the free-book listing on qidian.com.

    Walks the listing page by page (20 books per page), then for each book
    fetches its catalog page and downloads every chapter into a single
    UTF-8 ``.txt`` file under ``localPath``.
    """

    def __init__(self, pages, url, localPath):
        """
        :param pages: number of listing pages to crawl
        :param url: listing endpoint, e.g. 'https://www.qidian.com/free/all'
        :param localPath: output directory; a trailing path separator is
            expected because file paths are built by string concatenation
        """
        self.pages = pages
        self.url = url
        self.localPath = localPath
        # 'Connection: close' keeps the many short requests from piling up
        # half-open keep-alive sockets.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Connection': 'close'
        }

    def download_book(self):
        """Crawl every listing page and download all books found on it."""
        self.create_folder()
        for page_no in range(1, self.pages + 1):
            # Query parameters reproduce the site's free-book listing filter.
            param = {
                "orderId": '',
                "vip": 'hidden',
                "style": 1,
                'pageSize': 20,
                "siteid": 1,
                "pubflag": 0,
                "hiddenField": 1,
                "page": page_no
            }
            try:
                # Fetch one listing page, parse its 20 books, then download
                # them with a worker pool.
                data_responses = self.get_responses(param)
                book_info_list = self.get_book_info(data_responses)
                self.multiprocess_download(book_info_list, 10)
            except Exception as e:
                # A failed page must not abort the remaining pages.
                print(e)

    def create_folder(self):
        """Create the output directory if it does not exist yet."""
        # exist_ok avoids the check-then-create race of an explicit
        # os.path.exists() guard; genuine failures still raise.
        os.makedirs(self.localPath, exist_ok=True)

    def get_responses(self, param):
        """GET one listing page and return the Response.

        Network errors propagate to the caller (which skips the page).
        The original swallowed them and returned None, making the caller
        crash with AttributeError on ``None.text`` instead.
        """
        return requests.get(self.url, params=param, timeout=10,
                            headers=self.headers)

    def get_book_info(self, data_responses):
        """Extract {title, id, author} dicts from a listing page response."""
        soup = BeautifulSoup(data_responses.text, 'lxml')
        book_info_list = []
        for raw in soup.select('div.book-mid-info'):
            link = raw.select('h4 > a')[0]
            book_info_list.append({
                "title": link.get_text(),
                # data-bid is the site's numeric book id; it is used later
                # to build the info/catalog URL.
                "id": link.get('data-bid'),
                'author': raw.select('.name')[0].get_text()
            })
        return book_info_list

    def _book_path(self, title):
        """Return the output .txt path for a book title."""
        return self.localPath + '{}.txt'.format(title)

    def multiprocess_download(self, book_info_list, process):
        """Download the given books concurrently with ``process`` workers."""
        pool = Pool(process)
        for book_info_dict in book_info_list:
            pool.apply_async(self.download_one, (book_info_dict,))
        pool.close()
        pool.join()

    def download_one(self, book_info_dict):
        """Worker: fetch one book's catalog, then save all its chapters."""
        # Skip books already downloaded. The original compared against the
        # bare title (without '.txt'), so the check never matched a saved
        # file and every book was re-downloaded on each run.
        if os.path.exists(self._book_path(book_info_dict["title"])):
            print('exists:', self._book_path(book_info_dict["title"]))
            return
        try:
            book_catalog_responses = requests.get(
                "https://book.qidian.com/info/%s#Catalog" % book_info_dict["id"],
                timeout=10, headers=self.headers)
            if book_catalog_responses.status_code == 200:
                print("當前進程ID:{}，圖書信息:{}".format(os.getpid(), book_info_dict))
                self.get_book_catalog_url(book_catalog_responses, book_info_dict)
                self.save_book_content(book_info_dict)
        except Exception as e:
            # Best-effort: log and move on so one bad book does not kill
            # the worker.
            print("異常:{}".format(book_info_dict), e)

    def get_book_catalog_url(self, book_catalog_responses, book_info_dict):
        """Collect chapter URLs from the catalog page into the book dict."""
        soup = BeautifulSoup(book_catalog_responses.text, 'html.parser')
        anchors = soup.select('.volume-wrap li[data-rid] a[href]')
        # Chapter links are protocol-relative ('//read.qidian.com/...');
        # save_book_content prepends 'https:'.
        book_info_dict['bookCatalogUrl'] = [a['href'] for a in anchors]

    def save_book_content(self, book_info_dict):
        """Fetch every chapter of a book and write them to its .txt file."""
        with open(self._book_path(book_info_dict["title"]), 'w', encoding='utf-8') as f:
            f.write(book_info_dict['title'] + '\n')
            f.write("作者:" + book_info_dict['author'] + '\n')
            for url in book_info_dict['bookCatalogUrl']:
                try:
                    book_content_responses = requests.get(
                        "https:" + url, timeout=10, headers=self.headers)
                    if book_content_responses.status_code == 200:
                        soup = BeautifulSoup(book_content_responses.text, 'html.parser')
                        chapter_title = soup.find('h3', attrs={'class': 'j_chapterName'}).get_text()
                        f.write('\n' + chapter_title + '\n')
                        chapter_body = soup.find('div', attrs={'class': "read-content j_readContent"})
                        f.write('\t' + chapter_body.get_text() + '\n')
                except Exception as e:
                    # One broken chapter should not abort the whole book.
                    print('異常:{}章節獲取失敗'.format(book_info_dict['title']), e)
if __name__ == '__main__':
    # Crawl 100 listing pages of the free-book index into E:/book/.
    QidianSpider(100, 'https://www.qidian.com/free/all', 'E://book//').download_book()