Background:
The Jikexueyuan website has a knowledge-system map under its course tab. The curriculum is fairly complete, so I paid for a month of VIP and wanted to download the course series to watch at my own pace, which led to the crawler below. (For personal study only; please do not use it commercially. I will take this down on request.)
Technical documentation:
To learn Python crawling you will want the Requests user guide and the Beautiful Soup documentation.
Code:
- config.py: basic crawler settings
proxies = {
    'https': '42.123.125.181:8088',
}
headers = {
    # fill in your own cookie here
    'Cookie': '_uab_collina=xxxxx; PHPSESSID=xxxxx; jkxyid_v2=xxxx; _ga=xxxx; _gid=xxxx; gr_user_id=xxxx; uname=xxxxx; uid=xxxx; code=xxxx; authcode=xxxxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
KnowledgeSystemUrl = 'https://www.jikexueyuan.com/path/'
To dodge the site's anti-crawling limits on a single local IP, a proxies entry with a proxy server address is added; you can find servers on domestic high-anonymity proxy lists. Downloading some of the videos requires a VIP login, so grab the request headers your browser sends when visiting Jikexueyuan (inspect the HTTP request headers in the browser's developer tools) and reuse the Cookie value to skip the login step.
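Before running the full crawler, it is worth confirming that the cookie and the proxy actually work. A minimal check, assuming config.py is already filled in (it tests against the knowledge-system page itself; any VIP-only page would do):

import requests
from config import headers, proxies, KnowledgeSystemUrl

r = requests.get(KnowledgeSystemUrl, headers=headers, proxies=proxies, timeout=20)
print(r.status_code)   # 200 means the proxy is reachable and the request went through
# if the cookie is valid the page renders as logged in; search the HTML for
# the uname value from your own cookie (the 'xxxxx' below is a placeholder)
print('xxxxx' in r.text)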
- crawl.py: fetch the HTML
import requests
from config import headers
from config import proxies

class Crawl(object):
    def getText(self, url):
        # fetch a page and return its decoded HTML ('' on failure, so
        # callers that parse the result with BeautifulSoup still work)
        try:
            r = requests.get(url, headers=headers, proxies=proxies, timeout=20)
            r.encoding = r.apparent_encoding
            print(r.status_code, 'request')
            self.html = r.text
            return r.text
        except requests.RequestException as e:
            print('getText error:', e)
            return ''

    def getResponse(self, url):
        # fetch a page and return the raw Response object (None on failure)
        try:
            r = requests.get(url, headers=headers, proxies=proxies, timeout=20)
            r.encoding = r.apparent_encoding
            print(r.status_code, 'request')
            return r
        except requests.RequestException as e:
            print('getResponse error:', e)
            return None
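A quick way to exercise the class on its own, using only what config.py and crawl.py define:

from crawl import Crawl
from config import KnowledgeSystemUrl

c = Crawl()
html = c.getText(KnowledgeSystemUrl)   # prints e.g. "200 request" on success
print(len(html))                       # 0 means the request failed and '' came back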
- KnowledgeSystem.py: fetch the knowledge-system list
from config import KnowledgeSystemUrl
from crawl import Crawl
from bs4 import BeautifulSoup

class KnowledgeSystem(Crawl):
    class listData():
        # track names and their page URLs, filled in by getList
        nameList = []
        srcList = []

    def getList(self):
        try:
            html = self.getText(KnowledgeSystemUrl)
            soup = BeautifulSoup(html, 'html.parser')
            print('---- Looking up the knowledge-system list ----')
            srcList = []
            nameList = []
            index = 1
            # a bare string passed as attrs is matched against the class attribute
            cf = soup.find_all(attrs='pathlist-one cf')
            for member in cf:
                h2 = member.find('h2')
                print('%d ' % (index) + h2.string)
                nameList.append(h2.string)
                srcList.append('https:' + member['href'])
                index = index + 1
            ld = self.listData()
            ld.nameList = nameList
            ld.srcList = srcList
            return ld
        except Exception as e:
            print('getList error:', e)

    def select(self):
        n = input('----- Enter the number of the track you want to download -----\n')
        return int(n)
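Run stand-alone, the class behaves like the first half of main.py below:

from KnowledgeSystem import KnowledgeSystem

KS = KnowledgeSystem()
ld = KS.getList()    # prints a numbered list of the tracks
n = KS.select()      # the user picks one by number
print(ld.nameList[n - 1], ld.srcList[n - 1])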
- courseList.py: fetch the course list for each chapter
from crawl import Crawl
from bs4 import BeautifulSoup

class CourseList(Crawl):
    class CourseData:
        chapterName = ''
        lessonNameList = []
        lessonSrcList = []

    class CourseList:
        # holds one CourseData per chapter
        chapterList = []

    def getCourse(self, url):
        print('------- Fetching the course info for this track ---------')
        chapterListHtml = self.getText(url)
        chapterListSoup = BeautifulSoup(chapterListHtml, 'html.parser')
        temp = chapterListSoup.find_all(attrs='pathstage mar-t30')
        self.CourseList.chapterList = []
        for each in temp:
            # store the chapter name in a CourseData instance
            CD = self.CourseData()
            CD.chapterName = each.find('h2').string
            lessonInfoList = each.find_all(attrs='lesson-info-h2')
            index = 1
            # reset the lesson-name and lesson-URL lists
            CD.lessonNameList = []
            CD.lessonSrcList = []
            for info in lessonInfoList:
                # store the lesson name in the CourseData name list
                courseName = str(index) + '.' + info.string
                CD.lessonNameList.append(courseName)
                # store the lesson URL in the CourseData url list
                lessonSrc = 'https:' + info.a['href']
                CD.lessonSrcList.append(lessonSrc)
                index = index + 1
            # keep the finished CourseData in chapterList
            self.CourseList.chapterList.append(CD)

    def printChapterNameList(self):
        print('----- This track has the following chapters: -----')
        for each in self.CourseList.chapterList:
            print(each.chapterName)

    def printLessonNameList(self):
        for each in self.CourseList.chapterList:
            for lessonName in each.lessonNameList:
                print(lessonName)

    def printLessonSrcList(self):
        for each in self.CourseList.chapterList:
            for lessonSrc in each.lessonSrcList:
                print(lessonSrc)
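A short stand-alone sketch of the class (the track URL here is an assumed example; real URLs come from KnowledgeSystem.getList):

from courseList import CourseList

CL = CourseList()
CL.getCourse('https://www.jikexueyuan.com/path/android/')  # assumed example URL
CL.printChapterNameList()   # chapter names
CL.printLessonNameList()    # numbered lesson names
CL.printLessonSrcList()     # lesson page URLs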
- section.py: fetch the section info for each lesson
from crawl import Crawl
from bs4 import BeautifulSoup
import bs4

class Section(Crawl):
    class SectionData:
        sectionNameList = []
        sectionSrcList = []

    def getSection(self, url):
        print('-------- Fetching the section info for this lesson -------')
        lessonHtml = self.getText(url)
        soup = BeautifulSoup(lessonHtml, 'html.parser')
        temp = soup.find(attrs='lessonvideo-list')
        # the page sometimes comes back without the list; re-fetch until it parses
        while not isinstance(temp, bs4.element.Tag):
            lessonHtml = self.getText(url)
            soup = BeautifulSoup(lessonHtml, 'html.parser')
            print('lessonvideo-list not found, retrying')
            temp = soup.find(attrs='lessonvideo-list')
        aTag = temp.find_all('a')
        self.SectionData.sectionNameList = []
        self.SectionData.sectionSrcList = []
        for each in aTag:
            self.SectionData.sectionNameList.append(each.string)
            self.SectionData.sectionSrcList.append('https:' + each['href'])
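The retry loop above re-fetches forever if a page never yields the lessonvideo-list block, which can hang on a dead link. One way to bound it, as a sketch (findWithRetry and maxRetries are names introduced here, not part of the original scripts):

import bs4
from bs4 import BeautifulSoup

def findWithRetry(crawl, url, selector, maxRetries=5):
    # re-fetch url until `selector` (a class string) matches a Tag,
    # giving up after maxRetries attempts
    for attempt in range(maxRetries):
        soup = BeautifulSoup(crawl.getText(url), 'html.parser')
        tag = soup.find(attrs=selector)
        if isinstance(tag, bs4.element.Tag):
            return tag
        print('retry %d: %s not found yet' % (attempt + 1, selector))
    return None  # the caller must handle a permanent failure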
- download.py: download the videos
from crawl import Crawl
from section import Section
from bs4 import BeautifulSoup
import bs4
import os
import requests

class Download(Crawl):
    class DownloadData:
        sourceList = []
        nameList = []

    def findVideoSrc(self, SectionData):
        print('----- Fetching the video links for this lesson -------')
        self.DownloadData.sourceList = []
        self.DownloadData.nameList = SectionData.sectionNameList
        for Src in SectionData.sectionSrcList:
            html = self.getText(Src)
            soup = BeautifulSoup(html, 'html.parser')
            sourceTag = soup.find('source')
            # re-fetch until the <source> tag shows up in the page
            while not isinstance(sourceTag, bs4.element.Tag):
                print('<source> tag not found, retrying')
                html = self.getText(Src)
                soup = BeautifulSoup(html, 'html.parser')
                sourceTag = soup.find('source')
            source = sourceTag['src']
            self.DownloadData.sourceList.append(source)

    def makeDir(self, dirName):
        print('------- Creating path: %s ------' % dirName)
        try:
            if os.path.exists(dirName):
                return dirName
            else:
                os.mkdir(dirName)
                return dirName
        except OSError:
            print('The path being created was: ' + dirName)
            dirName = input('Creation failed, please type a path by hand: ')
            dirName = self.makeDir(dirName)
            return dirName

    def saveVideoFile(self, path, videoName, videoSrc):
        videoFilePath = path + '/' + videoName + '.mp4'
        if os.path.exists(videoFilePath):
            print('  Video already exists: %s' % (videoName))
            return
        else:
            print('  Downloading video %s' % (videoName))
            video = requests.get(videoSrc)
            print('  Saving video %s' % (videoName))
            with open(videoFilePath, 'wb') as f:
                f.write(video.content)

    def downloadVideo(self, path):
        path = self.makeDir(path)
        for i in range(len(self.DownloadData.sourceList)):
            self.saveVideoFile(path, self.DownloadData.nameList[i], self.DownloadData.sourceList[i])
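saveVideoFile buffers the whole video in memory (video.content) before writing it out. For long lessons a streamed download is lighter on RAM; a possible drop-in variant, as a sketch rather than the original author's code:

import requests

def saveVideoFileStreamed(path, videoName, videoSrc):
    # stream the video to disk in 1 MiB chunks instead of buffering it all
    videoFilePath = path + '/' + videoName + '.mp4'
    with requests.get(videoSrc, stream=True, timeout=20) as video:
        with open(videoFilePath, 'wb') as f:
            for chunk in video.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)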
- main.py: the main program
import sys
import io
from KnowledgeSystem import KnowledgeSystem
from courseList import CourseList
from section import Section
from download import Download
import os

# optional: force UTF-8 stdout if the console garbles Chinese names
#sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

if __name__ == '__main__':
    KS = KnowledgeSystem()
    # fetch the knowledge-system list
    KSLD = KS.getList()
    # let the user pick the track to download
    num = KS.select()
    # name and URL of the chosen track
    ksName = KSLD.nameList[num - 1]
    ksSrc = KSLD.srcList[num - 1]
    # fetch all courses of that track
    CL = CourseList()
    CL.getCourse(ksSrc)
    CL.printChapterNameList()
    sec = Section()
    dld = Download()
    pathTemp = dld.makeDir('./' + ksName)                              # e.g. ./android
    for each in CL.CourseList.chapterList:
        pathTemp2 = dld.makeDir(pathTemp + '/' + each.chapterName)     # e.g. ./android/1.環(huán)境搭建
        for i in range(len(each.lessonSrcList)):
            path = pathTemp2 + '/' + each.lessonNameList[i]            # e.g. ./android/1.環(huán)境搭建/1.Android 集成開發(fā)環(huán)境搭建
            sec.getSection(each.lessonSrcList[i])
            # if the lesson's last section is already on disk, skip the whole lesson
            videoFilePath = path + '/' + sec.SectionData.sectionNameList[-1] + '.mp4'
            if os.path.exists(videoFilePath):
                print('File already exists, skipping %s' % videoFilePath)
            else:
                dld.findVideoSrc(sec.SectionData)
                dld.downloadVideo(path)
    print('download successful')
Preview of the results:
Fetching the resources you want
Downloading the videos
The downloaded video files