继续上次的知乎爬虫，这次开始了哔哩哔哩的爬虫实践；
首先介绍下如何下载吧：VideoHelper 里面有三种方式下载 b 站视频。
同样的流程，还是先抓包，分析参数，寻找参数（包括之前的请求包和页面源码），找出视频真实地址，然后再模拟。
抓包时注意几个参数：
aid：每个视频都会有对应的 aid，包括 ep 类型的；
cid：弹幕的 id，通过相关 api 可由 cid 找到对应的资源列表；
ep_id：就是地址栏上显示的 ep 类型的 id 了。
这里详细的流程我就不介绍了（其实我是来宣传 VideoHelper 的，目前还支持知乎等网站视频，欢迎 star，滑稽 (>﹏<)）
其中需要注意的是模拟发包时有些请求头是不能掉的，user-agent 我就不说了，比如 Referer；
另外我发现网上目前仅存的 b 站视频爬虫好像大多不支持 ep 类型的，不过我那个最近测试是支持了的，但是 vip 专属的也是会直接报错；
另外注明：该项目参考了 you-get 的部分 api。
下面老规矩贴上主要源码：
package website;
import bean.BilibiliBean;
import bean.VideoBean;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import util.DownloadUtil;
import util.HttpUtil;
import util.MD5Encoder;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.*;
import static util.PrintUtil.println;
/**
 * Resolver and downloader for bilibili.com videos.
 *
 * <p>A play URL is recognised when one of its segments is {@code av<digits>}, {@code ep<digits>}
 * or {@code ss<digits>} (at least four digits). Resolving a URL fills in the title, the duration,
 * the total size and the list of segment URLs; downloading then stores every segment as an
 * {@code .flv} file in the output directory.
 *
 * <p>NOTE(review): {@code urls} is not declared in this class; it is presumably a mutable
 * {@code List<String>} inherited from {@code BaseSite} - confirm.
 *
 * @author Asche
 * @date 2018-10-20 18:02:29
 * @github https://github.com/asche910
 */
public class Bilibili extends BaseSite {

    /** Endpoint queried with an aid; the first entry of its JSON array carries the cid. */
    private static final String API_GET_LIST = "https://www.bilibili.com/widget/getPageList?aid=";
    /** Play-url endpoint used for av-type videos (JSON response). */
    private static final String AV_API = "http://interface.bilibili.com/v2/playurl?";
    /** Play-url endpoint used for ep/ss-type videos (XML response). */
    private static final String EP_API = "http://bangumi.bilibili.com/player/web_api/playurl?";
    /** Secret appended to the av query string before hashing it into {@code sign}. */
    private static final String SEC_1 = "94aba54af9065f71de72f5508f1cd42e";
    /** Secret appended to the ep query string before hashing it into {@code sign}. */
    private static final String SEC_2 = "9b288147e5474dd2aa67085f716c560d";

    // quality codes sent to the play-url endpoints
    private static final int RESOLUTION_1080 = 112;
    private static final int RESOLUTION_720 = 64;
    private static final int RESOLUTION_480 = 32;
    private static final int RESOLUTION_360 = 15;

    // video types
    private static final int AV_VIDEO = 1;
    private static final int EP_VIDEO = 2;
    private static final int SS_VIDEO = 3;

    private int quality = RESOLUTION_1080;

    private String playUrl;
    private String fileName;
    /** Duration in milliseconds, as reported by the play-url endpoint. */
    private int timeLength;
    /** Sum of all segment sizes in bytes; a long because the total can exceed 2 GiB. */
    private long fileSize = 0;
    private int aid;
    private int cid;

    private int type = AV_VIDEO;
    private boolean isSupported;

    // episodes related to the current ep
    private List<BilibiliBean> serialList = new ArrayList<>();

    // whether the current play url has been resolved successfully
    private boolean isResolved;

    public Bilibili() {
    }

    /**
     * Resolves the video information up front so that the caller can inspect it via
     * {@link #getInfo()} before deciding whether to download.
     *
     * <p>Failures are printed, not thrown; the instance then stays unresolved and
     * {@link #downloadByUrl(String, String)} will try to resolve again.
     *
     * @param playUrl   page url of the video
     * @param outputDir not used by the constructor; kept for interface compatibility
     */
    public Bilibili(String playUrl, String outputDir) {
        parsePlayUrl(playUrl);
        if (!isSupported) {
            println("Bilibili: unsupported url: " + playUrl);
            return;
        }
        try {
            resolve();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Resolves the given url (unless this instance already resolved the very same url) and
     * downloads all of its segments into {@code outputDir}.
     *
     * <p>Errors are printed, not thrown. Unsupported urls are reported and skipped.
     *
     * @param playUrl   page url of the video
     * @param outputDir directory the files are written to
     */
    @Override
    public void downloadByUrl(String playUrl, String outputDir) {
        println("Bilibili start: ");
        // Results resolved for another url must not be reused for this one.
        if (!Objects.equals(playUrl, this.playUrl)) {
            isResolved = false;
        }
        parsePlayUrl(playUrl);
        if (!isSupported) {
            println("Bilibili: unsupported url: " + playUrl);
            return;
        }
        try {
            resolve();
            println("# Title: " + fileName);
            println(" -TimeLength: " + formatDuration());
            println(" -File Size: " + fileSize / 1024 / 1024 + " M");
            download(urls, outputDir);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Internal download entry: fetches every source url into {@code outputDir}.
     *
     * <p>A single source is stored as {@code <title>.flv}; several sources are stored as
     * {@code <title>【n】.flv} with {@code n} starting at 1.
     *
     * @param videoSrcs segment urls to fetch
     * @param outputDir directory the files are written to
     * @throws IOException declared for the underlying download call
     */
    @Override
    public void download(List<String> videoSrcs, String outputDir) throws IOException {
        Map<String, List<String>> headerMap = new HashMap<>();
        // The original author notes that a missing Referer makes the server answer with error 453.
        headerMap.put("Referer", Collections.singletonList("http://interface.bilibili.com/v2/playurl?appkey=84956560bc028eb7&cid=59389212&otype=json&qn=3&quality=3&type=&sign=4c841d687bb7e479e3111428c6a4d3b8"));

        String basePath = outputDir + File.separatorChar + safeFileName();
        int total = videoSrcs.size();
        int index = 0;
        for (String src : videoSrcs) {
            index++;
            println("Download: " + index + "/" + total);
            String filePath = total == 1
                    ? basePath + ".flv"
                    : basePath + "【" + index + "】.flv";
            DownloadUtil.downloadVideo(src, filePath, headerMap);
        }
        println("Download: All Done!");
    }

    /**
     * @return a bean holding the title, the duration formatted as {@code m:ss} and the total size
     *         in MiB of the resolved video
     */
    @Override
    public VideoBean getInfo() {
        VideoBean bean = new VideoBean();
        bean.setTitle(fileName);
        bean.setTimeLength(formatDuration());
        bean.setSize((int) (fileSize / 1024 / 1024));
        return bean;
    }

    /**
     * @return the episodes collected while resolving an ep/ss url (empty for av urls)
     */
    public List<BilibiliBean> getSerialList() {
        return serialList;
    }

    /**
     * Stores the url and derives {@code type}, {@code aid} and {@code isSupported} from it.
     * The url is split on {@code /}, {@code ?} and {@code #} so that a trailing query string or
     * fragment does not hide the id segment.
     */
    private void parsePlayUrl(String url) {
        this.playUrl = url;
        isSupported = false;
        type = AV_VIDEO;
        aid = 0;
        if (url == null) {
            return;
        }
        for (String segment : url.split("[/?#]")) {
            if (segment.matches("av\\d{4,}")) {
                try {
                    aid = Integer.parseInt(segment.substring(2));
                } catch (NumberFormatException e) {
                    // More digits than an int can hold: not a usable aid.
                    return;
                }
                isSupported = true;
                return;
            } else if (segment.matches("ep\\d{4,}")) {
                type = EP_VIDEO;
                isSupported = true;
                return;
            } else if (segment.matches("ss\\d{4,}")) {
                type = SS_VIDEO;
                isSupported = true;
                return;
            }
        }
    }

    /**
     * Fetches cid, title, duration, size and segment urls for the current url. Does nothing when
     * the url is already resolved; marks the instance as resolved only on success.
     */
    private void resolve() throws IOException, DocumentException {
        if (isResolved) {
            return;
        }
        // Drop leftovers of an earlier (possibly failed) attempt so nothing is counted twice.
        fileName = null;
        timeLength = 0;
        fileSize = 0;
        urls.clear();
        serialList.clear();

        switch (type) {
            case SS_VIDEO:
            case EP_VIDEO: {
                initEp();
                String epApi = generateEpApi(EP_API, cid, quality);
                println(epApi);
                parseEpApiResponse(epApi);
                break;
            }
            case AV_VIDEO:
            default: {
                initAv();
                String avApi = generateAvApi(AV_API, cid, quality);
                println(avApi);
                parseAvApiResponse(avApi);
                break;
            }
        }
        isResolved = true;
    }

    /** Formats {@code timeLength} (milliseconds) as {@code minutes:seconds}. */
    private String formatDuration() {
        int totalSeconds = timeLength / 1000;
        return totalSeconds / 60 + ":" + String.format("%02d", totalSeconds % 60);
    }

    /**
     * Returns the title without characters that are illegal in file names, falling back to
     * {@code bilibili_<cid>} when no usable title is known.
     */
    private String safeFileName() {
        String name = fileName == null ? "" : fileName.replaceAll("[\\\\/:*?\"<>|]", "").trim();
        return name.isEmpty() ? "bilibili_" + cid : name;
    }

    /**
     * Sets cid and fileName for an av-type video.
     *
     * @throws IOException if a request fails
     */
    private void initAv() throws IOException {
        String result = HttpUtil.getResponseContent(API_GET_LIST + aid);
        // Only the first entry of the returned list is used.
        cid = new JSONArray(result).getJSONObject(0).getInt("cid");

        Document doc = Jsoup.connect(playUrl).get();
        Element viewBox = doc.selectFirst("div[id=viewbox_report]");
        Element heading = viewBox == null ? null : viewBox.selectFirst("h1");
        if (heading != null && heading.hasAttr("title")) {
            fileName = heading.attr("title");
        } else {
            // Page layout differs from what is expected: fall back to the document title.
            fileName = doc.title();
        }
    }

    /**
     * Sets cid and fileName for an ep/ss-type video and collects the related episodes.
     * The data is read from the JSON embedded in the third child element of the page body,
     * between the markers {@code __=} and {@code ;(function()}.
     *
     * @throws IOException if the request fails or the embedded JSON cannot be located
     */
    private void initEp() throws IOException {
        Document doc = Jsoup.connect(playUrl).get();
        Element body = doc.body();
        if (body == null || body.children().size() <= 2) {
            throw new IOException("Unexpected page layout: " + playUrl);
        }
        String html = body.child(2).toString();
        int start = html.indexOf("__=");
        int end = html.indexOf(";(function()");
        if (start < 0 || end < start + 3) {
            throw new IOException("Embedded episode data not found: " + playUrl);
        }

        JSONObject state = new JSONObject(html.substring(start + 3, end));
        fileName = state.getJSONObject("mediaInfo").getString("title");
        cid = state.getJSONObject("epInfo").getInt("cid");

        JSONArray epList = state.getJSONArray("epList");
        for (int i = 0; i < epList.length(); i++) {
            JSONObject ep = epList.getJSONObject(i);
            BilibiliBean bean = new BilibiliBean(
                    ep.getInt("aid"),
                    ep.getInt("cid"),
                    ep.getInt("duration"),
                    ep.getInt("ep_id"),
                    ep.getString("index"),
                    ep.getString("index_title"));
            serialList.add(bean);
            println(bean.toString());
        }
    }

    /**
     * Reads timeLength, fileSize and the segment urls from the JSON answer of the av endpoint.
     *
     * @param avReqApi signed request url
     * @throws IOException if the request fails
     */
    private void parseAvApiResponse(String avReqApi) throws IOException {
        String result = HttpUtil.getResponseContent(avReqApi);
        JSONObject response = new JSONObject(result);
        timeLength = response.getInt("timelength");
        JSONArray segments = response.getJSONArray("durl");
        for (int i = 0; i < segments.length(); i++) {
            JSONObject segment = segments.getJSONObject(i);
            urls.add(segment.getString("url"));
            fileSize += segment.getLong("size");
        }
    }

    /**
     * Reads timeLength, fileSize and the segment urls from the XML answer of the ep endpoint.
     *
     * @param epReqApi signed request url
     * @throws IOException       if the request fails or the answer carries no stream information
     * @throws DocumentException if the answer is not well-formed XML or the parser cannot be
     *                           configured
     */
    private void parseEpApiResponse(String epReqApi) throws IOException, DocumentException {
        String response = HttpUtil.getResponseContent(epReqApi);
        if (response == null) {
            throw new IOException("Empty response from " + epReqApi);
        }

        SAXReader reader = new SAXReader();
        try {
            // The answer is remote data fetched over plain http: refuse DTDs to rule out XXE.
            reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        } catch (org.xml.sax.SAXException e) {
            throw new DocumentException("Cannot configure XML parser", e);
        }
        byte[] xml = response.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        org.dom4j.Element rootElement = reader.read(new ByteArrayInputStream(xml)).getRootElement();

        String length = rootElement.elementTextTrim("timelength");
        if (length == null) {
            // Error answers carry no <timelength>.
            throw new IOException("No stream information in response: " + response);
        }
        timeLength = Integer.parseInt(length);

        List<org.dom4j.Element> elements = rootElement.elements("durl");
        for (org.dom4j.Element durl : elements) {
            String size = durl.elementTextTrim("size");
            if (size != null && !size.isEmpty()) {
                fileSize += Long.parseLong(size);
            }
            String url = durl.elementTextTrim("url");
            if (url != null && !url.isEmpty()) {
                urls.add(url);
            }
        }
        println(fileName + ": " + fileSize / 1024 / 1024 + "M");
    }

    /**
     * Builds the signed request url that returns download information for an av-type video.
     *
     * @param url     endpoint, ending with {@code ?}
     * @param cid     id of the resource
     * @param quality quality code, sent as both {@code qn} and {@code quality}
     * @return the endpoint followed by the query string and its {@code sign} parameter
     */
    private String generateAvApi(String url, int cid, int quality) {
        String paramStr = String.format("appkey=84956560bc028eb7&cid=%d&otype=json&qn=%d&quality=%d&type=", cid, quality, quality);
        return sign(url, paramStr, SEC_1);
    }

    /**
     * Builds the signed request url that returns download information for an ep-type video.
     *
     * @param url     endpoint, ending with {@code ?}
     * @param cid     id of the resource
     * @param quality quality code
     * @return the endpoint followed by the query string and its {@code sign} parameter
     */
    private String generateEpApi(String url, int cid, int quality) {
        String paramStr = String.format("cid=%d&module=bangumi&player=1&quality=%d&ts=%d",
                cid, quality, System.currentTimeMillis() / 1000);
        return sign(url, paramStr, SEC_2);
    }

    /**
     * Appends {@code &sign=<lower-case md5 of paramStr + secret>} to the request.
     *
     * @throws IllegalStateException if the checksum cannot be computed
     */
    private String sign(String url, String paramStr, String secret) {
        try {
            String checkSum = MD5Encoder.md5(paramStr + secret).toLowerCase();
            return url + paramStr + "&sign=" + checkSum;
        } catch (Exception e) {
            // Fail here instead of handing a null url to the http layer.
            throw new IllegalStateException("Cannot sign request: " + paramStr, e);
        }
    }
}
完整代码位于：