1. Goal
Recently I was looking for documentary films to watch and, through an article on Zhihu, came across the site 紀(jì)錄片天地 ([http://www.jlpcn.net](http://www.jlpcn.net/)). Its catalogue is rich, and after watching a few films I wanted to download them in batches.
A look at the page source shows that the film links, categories, and descriptions follow a fairly regular structure, which makes the site well suited to scraping with a script.
2. Approach
2.1 Fetching page content with requests
def getSoup(self, url):
    '''Fetch the page source with requests and hand it to BeautifulSoup'''
    headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    }
    try:
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        soup = bsp(r.text, "html.parser")
        return soup
    except Exception:
        print('getSoup ex:\n%s' % traceback.format_exc())
        return None
2.2 Extracting the summary information
Take http://www.jlpcn.net/vodtypehtml/1.html as an example: it is the first of 23 pages in the "內(nèi)容分類" -> "科普" (popular science) category.
First locate the div element with class='pages' via BeautifulSoup, then pull the total page count out of it with a regular expression.
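As a quick sanity check, the regular expression can be tried against a sample of the pager text (the sample string below is an assumption; the real markup may differ slightly):

import re

# hypothetical pager text taken from div.pages
pager_text = '首頁 上一頁 1 2 3 下一頁 尾頁 當(dāng)前:1/23頁'
print(re.findall(r'當(dāng)前:1/(.+?)頁', pager_text))  # ['23'] -> 23 pages in this category

The method that builds the full list of page URLs for a category then looks like this: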
# Work out how many pages the current category spans
def findAllLinks(self, url):
    '''
    First page of a documentary category, e.g.
    http://www.jlpcn.net/vodtypehtml/1.html
    '''
    links = []
    links.append(url)
    soup = self.getSoup(url)
    if not soup:
        return None
    index1 = url.rfind('.')
    base1 = url[0:index1]
    div = soup.find('div', attrs={"class": "pages"})
    if div:
        pagestr = re.findall(r'當(dāng)前:1/(.+?)頁', div.text)
        if len(pagestr) > 0:
            try:
                page_cnt = int(pagestr[0])
                for x in range(2, page_cnt + 1):
                    url_t = "{0}-{1}.html".format(base1, x)
                    links.append(url_t)
            except Exception:
                traceback.print_exc()
    return links
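A minimal usage sketch (the shape of the second-and-later page URLs in the comment is inferred from the string formatting above, not verified against the site):

search = SearchMoviesBaiduyun()  # the crawler class defined in section 3
links = search.findAllLinks('http://www.jlpcn.net/vodtypehtml/1.html')
# expected result, assuming the pager reports 23 pages:
# ['http://www.jlpcn.net/vodtypehtml/1.html',
#  'http://www.jlpcn.net/vodtypehtml/1-2.html',
#  ...
#  'http://www.jlpcn.net/vodtypehtml/1-23.html']
print(len(links))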
With the total page count known, each page can be parsed for the summary information of the films it lists. The code is as follows:
# Collect the summary information of the documentaries listed on one category page
def findMovies(self, url):
    resultList = []
    soup = self.getSoup(url)
    if not soup:
        return None
    # print(soup.encode_contents())
    li_list = soup.find_all('li', attrs={"class": "mov"})
    for li in li_list:
        imgbox = li.find('img', attrs={"class": "scrollLoading"})
        if imgbox:
            minfo = models.movie_summary()
            minfo.img_url = imgbox["data-url"]
            a_pic = li.find('a', attrs={"class": "pic"})
            if a_pic:
                minfo.href = a_pic["href"]
                minfo.title = a_pic["title"]
                minfo.title = minfo.title.replace('　', '')
            r1 = li.find('div', attrs={"class": "r1"})
            minfo.update_time = r1.string[5:]
            r3 = li.find_all('div', attrs={"class": "r3"})
            if r3 and len(r3) > 0:
                for r in r3:
                    if "內(nèi)容分類" in r.string:
                        minfo.content_category = r.string[5:]
                    elif "頻道分類" in r.string:
                        minfo.channel_category = r.string[5:]
                    elif "語言字幕" in r.string:
                        minfo.subtitles = r.string[5:]
                    elif "最后更新" in r.string:
                        minfo.last_update_time = r.string[5:]
            r5 = li.find('div', attrs={"class": "r5"})
            # r5 holds the short description, so store it as the summary
            minfo.summary = r5.string[5:]
            print("http://www.jlpcn.net" + minfo.href, minfo.title)
            resultList.append(minfo)
    print(len(li_list))
    return resultList
2.3 Drilling into the detail pages
Take http://www.jlpcn.net/vodhtml/3308.html as an example. On the detail page, the buttons marked in the screenshot link to the Baidu Netdisk addresses of the documentary. After inspecting the page source to locate these links, the code is as follows:
# Fetch the detail information of a documentary
def findMovieDetail(self, url):
    resultList = []
    soup = self.getSoup(url)
    if not soup:
        return None
    down_list_2 = soup.find('div', attrs={"id": "down_list_2"})
    if down_list_2:
        scripts = down_list_2.find_all('script')
        if len(scripts) > 0:
            for script in scripts:
                print(script.string)
    div_list = soup.find_all('div', attrs={"class": "wanpan"})
    for div in div_list:
        a_bd = div.find('a')
        href = a_bd["href"]
        text = a_bd.string
        if not text:
            text = ','.join(a_bd.strings)
        text = text.replace('　', '')
        # print(href, text)
        detail = models.movie_detail()
        detail.cur_url = url
        detail.title = text
        detail.href = href
        resultList.append(detail)
    # last_url = resultList[-1].href
    # r = requests.get(last_url)
    # print(r.text)
    return resultList
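Putting the two steps together for a single category page might look like the sketch below (the base URL concatenation mirrors what searchAllLinks does in section 3; the printed fields are just the ones that seemed most useful to inspect):

search = SearchMoviesBaiduyun()  # the crawler class from section 3
base_url = "http://www.jlpcn.net/"
for m in search.findMovies('http://www.jlpcn.net/vodtypehtml/1.html') or []:
    # m.href is a site-relative path such as /vodhtml/3308.html
    for d in search.findMovieDetail(base_url + m.href) or []:
        print(m.title, d.title, d.href)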
At this point the analysis is essentially done; all that remains is persisting the collected information.
3. Full code
# encoding:utf-8
__author__ = "liujinquan"
__date__ = "2018/1/16"

import os
import re
import threading
import traceback
import uuid

import requests
from bs4 import BeautifulSoup as bsp
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

import models


# Crawl documentary information from http://www.jlpcn.net/ and store it in a sqlite database.
# A later step reads the sqlite database and uses the you-get tool to download the
# documentaries whose links contain pan.baidu.com.
class SearchMoviesBaiduyun(object):
    def __init__(self):
        super(SearchMoviesBaiduyun, self).__init__()
        self.dbpath = r'sqlite:///F:\liujinquan\python\down_movie\movies.db'
        engine = create_engine(self.dbpath)
        self.Session = sessionmaker(bind=engine)

    def getSoup(self, url):
        '''Fetch the page source with requests and hand it to BeautifulSoup'''
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        }
        try:
            r = requests.get(url, headers=headers)
            r.encoding = 'utf-8'
            soup = bsp(r.text, "html.parser")
            return soup
        except Exception:
            print('getSoup ex:\n%s' % traceback.format_exc())
            return None
    # Work out how many pages the current category spans
    def findAllLinks(self, url):
        '''
        First page of a documentary category, e.g.
        http://www.jlpcn.net/vodtypehtml/1.html
        '''
        links = []
        links.append(url)
        soup = self.getSoup(url)
        if not soup:
            return None
        index1 = url.rfind('.')
        base1 = url[0:index1]
        div = soup.find('div', attrs={"class": "pages"})
        if div:
            pagestr = re.findall(r'當(dāng)前:1/(.+?)頁', div.text)
            if len(pagestr) > 0:
                try:
                    page_cnt = int(pagestr[0])
                    for x in range(2, page_cnt + 1):
                        url_t = "{0}-{1}.html".format(base1, x)
                        links.append(url_t)
                except Exception:
                    traceback.print_exc()
        return links

    # Collect the summary information of the documentaries listed on one category page
    def findMovies(self, url):
        resultList = []
        soup = self.getSoup(url)
        if not soup:
            return None
        # print(soup.encode_contents())
        li_list = soup.find_all('li', attrs={"class": "mov"})
        for li in li_list:
            imgbox = li.find('img', attrs={"class": "scrollLoading"})
            if imgbox:
                minfo = models.movie_summary()
                minfo.img_url = imgbox["data-url"]
                a_pic = li.find('a', attrs={"class": "pic"})
                if a_pic:
                    minfo.href = a_pic["href"]
                    minfo.title = a_pic["title"]
                    minfo.title = minfo.title.replace('　', '')
                r1 = li.find('div', attrs={"class": "r1"})
                minfo.update_time = r1.string[5:]
                r3 = li.find_all('div', attrs={"class": "r3"})
                if r3 and len(r3) > 0:
                    for r in r3:
                        if "內(nèi)容分類" in r.string:
                            minfo.content_category = r.string[5:]
                        elif "頻道分類" in r.string:
                            minfo.channel_category = r.string[5:]
                        elif "語言字幕" in r.string:
                            minfo.subtitles = r.string[5:]
                        elif "最后更新" in r.string:
                            minfo.last_update_time = r.string[5:]
                r5 = li.find('div', attrs={"class": "r5"})
                # r5 holds the short description, so store it as the summary
                minfo.summary = r5.string[5:]
                print("http://www.jlpcn.net" + minfo.href, minfo.title)
                resultList.append(minfo)
        print(len(li_list))
        return resultList
    # Fetch the detail information of a documentary
    def findMovieDetail(self, url):
        resultList = []
        soup = self.getSoup(url)
        if not soup:
            return None
        down_list_2 = soup.find('div', attrs={"id": "down_list_2"})
        if down_list_2:
            scripts = down_list_2.find_all('script')
            if len(scripts) > 0:
                for script in scripts:
                    print(script.string)
        div_list = soup.find_all('div', attrs={"class": "wanpan"})
        for div in div_list:
            a_bd = div.find('a')
            href = a_bd["href"]
            text = a_bd.string
            if not text:
                text = ','.join(a_bd.strings)
            text = text.replace('　', '')
            # print(href, text)
            detail = models.movie_detail()
            detail.cur_url = url
            detail.title = text
            detail.href = href
            resultList.append(detail)
        # last_url = resultList[-1].href
        # r = requests.get(last_url)
        # print(r.text)
        return resultList
    # Find the summaries and details of every documentary in one category and store them in the database
    def searchAllLinks(self, url1):
        base_url = "http://www.jlpcn.net/"
        results = []
        links = self.findAllLinks(url1)
        if links:
            for url in links:
                print("searching -> {0}".format(url))
                movies = self.findMovies(url)
                if movies:
                    for m in movies:
                        self.saveToSummaryTable(
                            self.convertToMovieSummary(base_url, m))
                        url_d = base_url + m.href
                        # print(url_d)
                        details = self.findMovieDetail(url_d)
                        if details:
                            for d in details:
                                # if "pan.baidu.com" in d.href:
                                soup1 = self.getSoup(d.href)
                                if not soup1:
                                    continue
                                title1 = soup1.title.string
                                d.video_name = m.title.replace(
                                    '　', '') + "_" + d.title + self.getMovieType(title1)
                                self.saveToDetailTable(
                                    self.convertToMovieDetail(d))
                                print(d.href, title1, d.video_name)
                                results.append(d)
        # for r in results:
        #     print(r.href, r.title, r.cur_url)
        # print("result len: {0}".format(len(results)))
        # list_url = [x.href for x in results]
        # moveToBaiduYun(list_url)
        # s2 = json.dumps(
        #     results,
        #     default=lambda obj: obj.__dict__,
        #     sort_keys=True,
        #     indent=None,
        #     ensure_ascii=False)
        # print(s2)
        return results
    def getMovieType(self, title):
        if ".avi" in title:
            return ".avi"
        elif ".mp4" in title:
            return ".mp4"
        elif ".rmvb" in title:
            return ".rmvb"
        elif ".mkv" in title:
            return ".mkv"
        elif ".ts" in title:
            return ".ts"
        else:
            return ".avi"

    def saveToDetailTable(self, detail):
        try:
            if isinstance(detail, models.MovieDetail):
                session = self.Session()
                detail.md_id = str(uuid.uuid1())
                session.add(detail)
                session.commit()
                session.close()
        except Exception:
            print('saveToDetailTable ex:\n%s' % traceback.format_exc())

    def saveToSummaryTable(self, summary):
        try:
            if isinstance(summary, models.MovieSummary):
                session = self.Session()
                summary.m_id = str(uuid.uuid1())
                session.add(summary)
                session.commit()
                session.close()
        except Exception:
            print('saveToSummaryTable ex:\n%s' % traceback.format_exc())

    def convertToMovieSummary(self, base_url, movie):
        md = models.MovieSummary()
        md.title = movie.title
        md.href = base_url + movie.href
        md.img_url = base_url + movie.img_url
        md.update_time = movie.update_time
        md.content_category = movie.content_category
        md.channel_category = movie.channel_category
        md.subtitles = movie.subtitles
        md.last_update_time = movie.last_update_time
        md.summary = movie.summary
        return md

    def convertToMovieDetail(self, detail):
        d = models.MovieDetail()
        d.cur_url = detail.cur_url
        d.title = detail.title
        d.href = detail.href
        d.video_name = detail.video_name
        return d

if __name__ == '__main__':
    search = SearchMoviesBaiduyun()
    types = [
        32, 20, 29, 31, 36, 30, 28, 27, 24, 19, 25, 39, 38, 22, 21, 37, 40, 23,
        33, 34, 35, 26, 46, 47, 44, 41, 42, 45
    ]
    for t in types:
        url1 = r'http://www.jlpcn.net/vodtypehtml/{0}.html'.format(t)
        search.searchAllLinks(url1)
And the corresponding model classes:
# coding: utf-8
from sqlalchemy import Column, Text, text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
metadata = Base.metadata


class MovieDetail(Base):
    __tablename__ = 'movie_detail'

    md_id = Column(Text(36), primary_key=True)
    cur_url = Column(Text(256))
    title = Column(Text(128))
    href = Column(Text(512))
    video_name = Column(Text(128))
    is_downloaded = Column(Text(3), server_default=text("'0'"))
    down_time = Column(Text(32))


class MovieSummary(Base):
    __tablename__ = 'movie_summary'

    m_id = Column(Text(36), primary_key=True)
    title = Column(Text(50))
    href = Column(Text(255))
    img_url = Column(Text(255))
    update_time = Column(Text(32))
    content_category = Column(Text(128))
    channel_category = Column(Text(128))
    subtitles = Column(Text(512))
    last_update_time = Column(Text(32))
    summary = Column(Text(512))


# Plain entity classes used while parsing, before conversion to the ORM models
class movie_summary(object):
    def __init__(self):
        super(movie_summary, self).__init__()
        self.title = ""
        self.href = ""
        self.img_url = ""
        self.update_time = ""
        self.content_category = ""
        self.channel_category = ""
        self.subtitles = ""
        self.last_update_time = ""
        self.summary = ""


class movie_detail(object):
    def __init__(self):
        super(movie_detail, self).__init__()
        self.cur_url = ""
        self.title = ""
        self.href = ""
        self.video_name = ""
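The scripts assume that movies.db already contains the movie_summary and movie_detail tables. If it does not, they can be created once from the models above; a minimal sketch (the database path is simply the one hardcoded in the crawler, adjust as needed):

from sqlalchemy import create_engine

import models

# create the movie_summary and movie_detail tables if they do not exist yet
engine = create_engine(r'sqlite:///F:\liujinquan\python\down_movie\movies.db')
models.Base.metadata.create_all(engine)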
2018/1/18, afternoon
The code above only collects the films' Baidu Netdisk links and stores them in the sqlite database. Pasting one of these links into a browser lets you watch the film directly, but downloading the film to disk through them is harder. The detail pages do, however, offer Thunder (迅雷) downloads, and the corresponding magnet links can be found in the page source.
Parsing and saving these magnet or ed2k links and then downloading them with aria2 or Thunder is also a workable approach. A revised findMovieDetail follows, with a small aria2 sketch after it.
# Fetch the detail information of a documentary (magnet/ed2k variant)
def findMovieDetail(self, url):
    resultList = []
    soup = self.getSoup(url)
    if not soup:
        return None
    down_list_2 = soup.find(id="down_list_2")
    if down_list_2:
        # print(down_list_2)
        scripts = down_list_2.find_all(
            'script', text=re.compile(r'ThunderEncode'))
        # print(len(scripts))
        if len(scripts) > 0:
            for script in scripts:
                s = str(script.string)
                # extract the magnet/ed2k link
                flag1 = r'ThunderEncode("'
                index1 = s.index(flag1) + len(flag1)
                index2 = s.index(r'"', index1)
                href_str = s[index1:index2]
                # extract the title
                flag2 = r'file_name="'
                index3 = s.index(flag2) + len(flag2)
                index4 = s.index(r'"', index3)
                title_str = s[index3:index4]
                # cache the result in the list
                detail = models.movie_detail()
                detail.cur_url = url
                detail.title = title_str.replace('　', '')
                detail.href = href_str
                resultList.append(detail)
    return resultList
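If aria2 is the chosen downloader, the magnet links can be handed to it through a subprocess call. A rough sketch, assuming aria2c is on the PATH; the helper name and the output directory are made up for illustration, and ed2k links are skipped because aria2 does not handle them:

import subprocess

def download_with_aria2(details, out_dir=r'F:\movies'):
    '''Hand each magnet link to aria2c in turn (no retry or bookkeeping).'''
    for d in details:
        if d.href.startswith('magnet:'):
            # -d sets the download directory; ed2k links would still need Thunder or similar
            subprocess.run(['aria2c', '-d', out_dir, d.href])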
2018/2/5, afternoon
With the resources collected, the next step was to download the files to my own hard drive.
The code above yields two kinds of resources, Baidu Netdisk links and magnet links. Relatively few magnet links were collected, most of them had already expired, and even Thunder could not download them. For the Baidu links I tried tools such as you-get and aria2c, with disappointing results: either the speed was throttled or the download failed outright.
In the end I decided to first transfer the shared files to my own Baidu Netdisk account and then download them in batches from there. There is an article online about saving shared Baidu Netdisk links to one's own account programmatically, but I could not get it to work, so I fell back to automating the browser with selenium.
# encoding:utf-8
__author__ = "liujinquan"
__date__ = "2018/1/28"

import datetime
import json
import logging
import os
import re
import threading
import time
import traceback
import urllib.parse
import uuid

import requests
from bs4 import BeautifulSoup as bsp
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

import models

chromedriver = "C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe"
os.environ["webdriver.chrome.driver"] = chromedriver
profile_dir = r"C:\Users\Administrator\AppData\Local\Mozilla\Firefox\Profiles\cqupe01d.default"
profile = webdriver.FirefoxProfile(profile_dir)
driver = webdriver.Firefox(profile)

# from selenium import webdriver
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# dcap = dict(DesiredCapabilities.PHANTOMJS)  # set the userAgent
# dcap["phantomjs.page.settings.userAgent"] = (
#     "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
# )


class MoveToBaiduYun(object):
    def __init__(self):
        super(MoveToBaiduYun, self).__init__()
        self.dbpath = r'sqlite:///F:\liujinquan\python\down_movie\movies.db'
        self.engine = create_engine(self.dbpath)
        self.Session = sessionmaker(bind=self.engine)

    def getAllBaiduLinks(self):
        try:
            session = self.Session()
            links = session.query(models.MovieDetail.href).all()
            session.commit()
            session.close()
            print(len(links))
            return list(links)
        except Exception:
            print('getAllBaiduLinks ex:\n%s' % traceback.format_exc())
            return None
    def moveToBaiduYun(self, list_url):
        # url = 'https://pan.baidu.com/s/1o8ID1hC'
        # Chrome, PhantomJS, etc. would also work here; if the driver is not on the PATH,
        # its location has to be given explicitly.
        # options = webdriver.ChromeOptions()
        # options.add_argument(
        #     "--user-data-dir=" +
        #     r"C:/Users/Administrator/AppData/Local/Google/Chrome/User Data")
        # driver = webdriver.Chrome(
        #     executable_path=chromedriver, options=options)
        # driver.maximize_window()
        # driver = webdriver.PhantomJS(
        #     executable_path='C:\Python\Python36\Scripts\phantomjs.exe',
        #     desired_capabilities=dcap)  # load the page
        # driver.maximize_window()
        profile = webdriver.FirefoxProfile(profile_dir)
        driver = webdriver.Firefox(profile)
        driver.maximize_window()
        for url in list_url:
            driver.get(url)
            print('Opening: ' + url)
            try:
                save_to_pans = driver.find_element_by_class_name(
                    "bar").find_elements_by_css_selector(
                        "[class='g-button g-button-blue']")
                print(len(save_to_pans))
                for tag in save_to_pans:
                    print(tag.text)
                    time.sleep(1)
                    if tag.get_attribute("data-button-id") == u'b1':
                        print("find target.")
                        time.sleep(1)
                        tag.click()
                        # for x in range(1, 10):
                        #     time.sleep(1)
                        #     tag.click()
                        time.sleep(1)
                        driver.switch_to_default_content()
                        save_buttons = driver.find_element_by_id(
                            "fileTreeDialog").find_element_by_css_selector(
                                "[class='dialog-footer g-clearfix']"
                            ).find_elements_by_css_selector(
                                "[class='g-button g-button-blue-large']")
                        print(len(save_buttons))
                        for btn in save_buttons:
                            if btn.get_attribute("data-button-id") == u'b13':
                                print("find target again!")
                                time.sleep(1)
                                btn.click()
                                break
                        time.sleep(3)
            except Exception:
                logging.error('down_movies ex:\n%s' % traceback.format_exc())
        return driver.get_cookies()
    def moveToBaiduYun_OldUrl(self, list_url):
        profile = webdriver.FirefoxProfile(profile_dir)
        driver = webdriver.Firefox(profile)
        driver.maximize_window()
        for url in list_url:
            driver.get(url)
            print('Opening: ' + url)
            try:
                # save_to_pans = driver.find_element_by_class_name(
                #     "bar").find_elements_by_css_selector(
                #         "[class='g-button g-button-blue']")
                save_to_pans = driver.find_element_by_id('emphsizeButton')
                if save_to_pans:
                    print("find target")
                    print(save_to_pans.text)
                    time.sleep(0.5)
                    save_to_pans.click()
                    time.sleep(0.5)
                    driver.switch_to_default_content()
                    save_buttons = driver.find_element_by_id('_disk_id_8')
                    if save_buttons:
                        print("find target again!")
                        time.sleep(0.5)
                        save_buttons.click()
                time.sleep(3)
            except Exception:
                logging.error('down_movies ex:\n%s' % traceback.format_exc())
        return driver.get_cookies()

if __name__ == '__main__':
    move = MoveToBaiduYun()
    links = move.getAllBaiduLinks()
    print(links[0], links[1])
    # links = [x[0] for x in links if 'pan.baidu.com' in x[0]]
    # print(len(links))
    # move.moveToBaiduYun(links)
    links = [x[0] for x in links if 'yun.baidu.com' in x[0]]
    print(len(links))
    move.moveToBaiduYun_OldUrl(links)