爬取的是整個筆趣閣的多個類型下的小說,分四層進行爬取,并存儲在 MySQL 數據庫中,為網站提供數據源。
代碼如下:
import re
import urllib.request
# Database operations
import pymysql
class Sql(object):
    """Persistence helper that writes scraped novels and chapters to MySQL.

    The connection is opened once, when the class body is executed, and is
    shared by every instance through the ``db`` class attribute.
    """

    db = pymysql.connect(host="localhost", port=3306, db="novel", user="root", password="root", charset="utf8")
    print('連接上了!')

    def addnovel(self, sort_id, sort_name, bookname, imgurl, description, status, author):
        """Insert one row into ``novel`` and return its auto-increment id.

        Values are passed as query parameters so the driver escapes them;
        scraped text containing quotes or backslashes can no longer break
        (or inject into) the statement.
        """
        cur = self.db.cursor()
        try:
            cur.execute(
                'insert into novel(booktype,sortname,name,imgurl,description,status,author) '
                'values(%s,%s,%s,%s,%s,%s,%s)',
                (sort_id, sort_name, bookname, imgurl, description, status, author))
            lastrowid = cur.lastrowid
        finally:
            # Release the cursor even when the insert fails.
            cur.close()
        self.db.commit()
        return lastrowid

    def addchapter(self, lastrowid, chaptname, content):
        """Insert one chapter row that belongs to the novel with id ``lastrowid``.

        Uses a parameterized query for the same reason as ``addnovel``:
        chapter text routinely contains quote characters.
        """
        cur = self.db.cursor()
        try:
            cur.execute(
                'insert into chapter(novelid,title,content) values(%s,%s,%s)',
                (lastrowid, chaptname, content))
        finally:
            cur.close()
        self.db.commit()
# Module-level Sql instance used by the scraping functions below to store
# novels and chapters.
mysql = Sql()
def type():
    """Read the site's home page and crawl every novel category found on it.

    The home page is decoded as GBK and stripped of newlines, tabs and
    carriage returns so the regular expressions can match across what were
    separate lines. Every ``(sort_id, sort_name)`` pair found between the
    ``fn-left`` and ``subnav`` markers is handed to ``getList``.

    Note: this function's name shadows the builtin ``type`` for the rest of
    the module.
    """
    raw = urllib.request.urlopen("https://www.duquanben.com/").read()
    page = raw.decode('gbk')
    for ch in ('\n', '\t', '\r'):
        page = page.replace(ch, '')

    for nav_fragment in re.findall(r'fn-left(.*?)subnav', page):
        categories = re.findall(r'book(.*?)/0/1/">(.*?)</a>', nav_fragment)
        for sort_id, sort_name in categories:
            getList(sort_id, sort_name)
def getList(sort_id, sort_name):
    """Fetch the first listing page of one category and crawl each linked book.

    Args:
        sort_id: Category identifier, interpolated into the listing URL.
        sort_name: Category display name; passed through to ``Novel``.
    """
    listing_url = 'https://www.duquanben.com/book%s/0/1/' % sort_id
    page = urllib.request.urlopen(listing_url).read().decode('gbk')

    # Each <li> entry carries the link to one book's detail page.
    book_link = re.compile(r'<li>.*?href="(.*?)" target=".*?">.*?</a></li>')
    for url in book_link.findall(page):
        Novel(url, sort_id, sort_name)
def Novel(url, sort_id, sort_name):
    """Scrape one book's detail page, store the book, then crawl its chapters.

    Args:
        url: Address of the book's detail page.
        sort_id: Category identifier the book was found under.
        sort_name: Category display name the book was found under.

    Raises:
        IndexError: If any of the expected fields is not found in the page.
    """
    html = urllib.request.urlopen(url).read().decode('gbk')
    for junk in ('\n', '\t', '\r', '<br />', ' '):
        html = html.replace(junk, '')

    # NOTE(review): the non-ASCII text in the patterns below looks mis-encoded
    # (e.g. "(nèi)", "(jiǎn)", "(jié)" act as extra capture groups, which makes
    # re.findall return tuples instead of strings). The literals are left
    # untouched here; confirm them against the live page markup.
    chapturl, bookname = re.findall(
        '投票推薦</span></a></span><span class="btopt"><a href="(.*?)" title="(.*?)" target="_blank"><span>開始閱讀</span></a></span>',
        html)[0]
    description = re.findall(r'內(nèi)容簡(jiǎn)介.*?intro.*?>(.*?)</div>', html)[0]
    imgurl = re.findall(r'<img src="(.*?)" alt=".*?', html)[0]
    status = re.findall(r'float:right">(.*?)</div>', html)[0]
    author = re.findall(r'作者:(.*?) 最新章節(jié)', html)[0]

    # Keep the id returned by the insert: chapters must reference this novel's
    # row. (A leftover ``lastrowid = 1`` used to attach every chapter to
    # novel 1.)
    lastrowid = mysql.addnovel(sort_id, sort_name, bookname, imgurl, description, status, author)

    print(chapturl, bookname, status, author)
    print("*" * 100)
    chaptList(chapturl, sort_id, sort_name, lastrowid, bookname)
def chaptList(chapturl, sort_id, sort_name, lastrowid, bookname):
    """Read a book's chapter index and fetch every chapter listed in it.

    Args:
        chapturl: Address of the chapter index page; also the prefix to which
            each chapter's relative link is later appended.
        sort_id: Unused here.
        sort_name: Unused here.
        lastrowid: Id of the novel row the chapters belong to.
        bookname: Unused here.
    """
    page = urllib.request.urlopen(chapturl).read().decode('gbk')
    page = page.replace('\n', '').replace('\t', '').replace('\r', '')

    chapter_link = re.compile(r'<li><a href="(.*?)">(.*?)</a></li>', re.S)
    # The chapter list sits between the "mulu_list" and "show_index3" markers.
    for section in re.findall(r'mulu_list(.*?)show_index3', page):
        for url1, chaptname in chapter_link.findall(section):
            chaptcontent(url1, chapturl, lastrowid, chaptname)
def chaptcontent(url1, chapturl, lastrowid, chaptname):
    """Download one chapter, strip markup remnants, and store it.

    Args:
        url1: Chapter link as found in the index; appended to ``chapturl``.
        chapturl: Address of the book's chapter index page.
        lastrowid: Id of the novel row the chapter belongs to.
        chaptname: Chapter title.

    Raises:
        IndexError: If the content container is not found in the page.
    """
    page = urllib.request.urlopen(chapturl + url1).read().decode('gbk')
    for ch in ('\n', '\t', '\r'):
        page = page.replace(ch, '')

    content = re.findall(r'class="contentbox">(.*?)<div class="ad00">', page)[0]
    # Applied in this exact order; each pass removes one kind of leftover.
    for leftover in ('<br />', ' ', '>', '<', '[..]', '-a', '/a '):
        content = content.replace(leftover, '')

    mysql.addchapter(lastrowid, chaptname, content)
# Entry point: start the crawl from the site's home page. Runs as soon as
# this file is executed (or imported).
type()
在 MySQL 可視化界面中新建了兩個表:
-- One row per scraped chapter; written by Sql.addchapter in the script.
CREATE TABLE `chapter` (
`id` int(20) NOT NULL AUTO_INCREMENT,
-- Receives the id of the owning `novel` row.
-- NOTE(review): declared varchar while `novel`.`id` is int -- confirm this is intended.
`novelid` varchar(50) DEFAULT NULL,
-- Chapter title.
`title` varchar(50) DEFAULT NULL,
-- Chapter body text after markup stripping.
`content` longtext,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=103 DEFAULT CHARSET=utf8;
-- One row per scraped book; written by Sql.addnovel in the script.
CREATE TABLE `novel` (
`id` int(50) NOT NULL AUTO_INCREMENT,
-- Category identifier (the script's sort_id).
`booktype` varchar(20) DEFAULT NULL,
-- Category display name (the script's sort_name).
`sortname` varchar(50) DEFAULT NULL,
-- Book title.
`name` varchar(50) DEFAULT NULL,
-- Cover image URL.
-- NOTE(review): 50 characters may be too short for a full URL -- verify against real data.
`imgurl` varchar(50) DEFAULT NULL,
-- Book synopsis.
-- NOTE(review): 200 characters may truncate or reject longer synopses -- verify.
`description` varchar(200) DEFAULT NULL,
-- Status text scraped from the book page.
`status` varchar(50) DEFAULT NULL,
-- Author name.
`author` varchar(20) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8;
學習網站:B站
爬取結果如下:
![~FN7FKIP0T]FTAZUJLOR58.png
![9E@55TKMW%G0[R40$_526.png](https://upload-images.jianshu.io/upload_images/11616627-2e189951152a475d.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/540)