想爬豆瓣相冊,起因是想將豆瓣照片轉移到其他地方,但是豆瓣的照片下載下來都是webp格式,正常情況下沒法打開,所以想到能不能通過selenium爬到數據呢?
E:\IDEAProject\MyProject\SeleniumTest
本來想的是從模擬登陸開始,然後一步步進到相冊裡,獲取相冊圖片列表,但是比較麻煩。還有一個更方便的操作:通過F12開發者工具,查看請求的url,帶上cookie去模擬請求
進到相冊首頁
GET請求帶cookie,https://www.douban.com/people/62414040/photos
通過對Response的分析
看到相冊信息以及相冊地址都在下圖所示的信息塊中
通過對該信息塊進行信息提取祟昭,可以得到相冊名列表和相冊地址url列表
進到相冊詳情頁
GET請求帶cookie,https://www.douban.com/photos/album/1871536872/
如果相冊裡圖片數量超過了18張,就會有第二頁
GET請求帶cookie,https://www.douban.com/photos/album/1871536872/?m_start=18
同理,如果有第三頁、第四頁,url是
https://www.douban.com/photos/album/1871536872/?m_start=36
https://www.douban.com/photos/album/1871536872/?m_start=54
通過對Response的分析
看到相冊信息以及相冊中每張圖片url都在下圖所示的信息塊中
有了圖片地址就好辦了,使用IO流下載圖片
代碼如下
獲取每個相冊中圖片的地址,返回集合
/**
 * Requests one album page (GET, with the session cookie) and collects the image URLs on it.
 *
 * <p>For every element with class {@code photo_wrap}, the {@code src} attribute of its
 * {@code img} descendant is read, every occurrence of {@code "webp"} in it is replaced by
 * {@code "jpg"}, and the result is appended to the returned list. Elements that yield no
 * {@code src} value are skipped instead of contributing an empty entry.
 *
 * @param passUrl    URL of the album page to request
 * @param folderName album name; only used in the log message
 * @return the rewritten image URLs, in the order the elements were returned; empty when the
 *         page yields none or no document was returned
 * @throws BusinessException declared for callers; presumably raised by the request helper
 */
public List<String> getImageUrlList(String passUrl, String folderName) throws BusinessException {
    logger.info("獲取豆瓣相冊" + folderName + "圖片地址");
    // NOTE(review): this is a captured browser session cookie; it is a credential and will
    // expire. It should come from configuration rather than source code.
    String passCookie = "gmxq-IDUYXw; douban-fav-remind=1; __yadk_uid=bK748gKAh8REVU6PsYrAF24ZroyVxSYA; " +
            "__gads=ID=9cf216a578728a98:T=1587890575:S=ALNI_MbpwRrOxhrMiUZPlDmEcg-YeTzuhw; ll=\"118159\"; " +
            "__utmc=30149280; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.6241; " +
            "douban-profile-remind=1; ct=y; __utmz=30149280.1588515963.4.3.utmcsr=baidu|utmccn=(organic)" +
            "|utmcmd=organic|utmctr=%E8%B1%86%E7%93%A3%E7%9B%B8%E5%86%8C%E4%B8%8B%E8%BD%BD%E6%80%8E%E4%B9%88%E6%98" +
            "%AFwebp; dbcl2=\"62414040:cC/OMBA010s\"; ck=snAu; gr_user_id=7f5f3e5d-9cf9-4d38-9e3e-8ffebe3b22b1; " +
            "_pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1588520368%2C%22https%3A%2F%2Fwww.baidu" +
            ".com%2Fbaidu%3Fisource%3Dinfinity%26iname%3Dbaidu%26itype%3Dweb%26tn%3D02003390_42_hao_pg%26ie%3Dutf-8" +
            "%26wd%3D5kg%25E5%25A4%25A7%25E7%25B1%25B3%25E4%25B8%2580%25E4%25B8%25AA%25E4%25BA%25BA%25E5%2590%2583" +
            "%25E5%25A4%259A%25E4%25B9%2585%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.835562099.1587890488" +
            ".1588515963.1588520368.5; __utmt=1; _pk_id.100001.8cb4=d2bb86d9f8460bdf.1587890487.3.1588520483" +
            ".1588517154.; __utmb=30149280.12.10.1588520368";
    List<String> imageUrl = new ArrayList<>();

    Document document = RequestUtil.sendGetRequestWithNullReturnDoc(passUrl, passCookie);
    if (document == null) {
        // NOTE(review): the helper's name suggests it returns null on failure - confirm.
        logger.info("No document returned for " + passUrl);
        return imageUrl;
    }

    for (Element photoWrap : document.getElementsByClass("photo_wrap")) {
        String src = photoWrap.getElementsByTag("img").attr("src");
        if (src.isEmpty()) {
            // attr() yields "" when there is no img/src; an empty URL is useless downstream.
            continue;
        }
        System.out.println(src);
        // Swapping "webp" for "jpg" in the URL makes the server hand back a JPEG.
        // NOTE(review): this rewrites every occurrence of "webp", not only the extension.
        imageUrl.add(src.replace("webp", "jpg"));
    }
    return imageUrl;
}
下載圖片方法
/**
 * Downloads the resource at {@code urlString} and saves it as
 * {@code <basePath>/<folderName>/<i>.jpg}.
 *
 * <p>The target directory is created when missing. An existing file with the same name is
 * overwritten (not appended to), and both streams are closed even when the copy fails.
 *
 * @param urlString  URL of the image to fetch
 * @param basePath   directory under which the album folder lives
 * @param folderName name of the album folder inside {@code basePath}
 * @param i          file name (without extension) to save under; per the original note this
 *                   is a snowflake ID generated via mybatis-plus
 * @throws Exception if the URL is malformed, the directory cannot be created, or any I/O
 *                   error occurs while reading or writing
 */
public static void download(String urlString, String basePath, String folderName, Long i) throws Exception {
    // Parse first so a malformed URL fails before anything touches the file system.
    URL url = new URL(urlString);

    // File(parent, child) uses the platform separator; a literal "\\" only works on Windows.
    File targetDir = new File(basePath, folderName);
    if (!targetDir.mkdirs() && !targetDir.isDirectory()) {
        throw new java.io.IOException("Unable to create directory: " + targetDir);
    }
    File target = new File(targetDir, i + ".jpg");

    URLConnection con = url.openConnection();
    // try-with-resources closes both streams on every path, including exceptions.
    // The output stream is opened in overwrite mode: appending to an existing file would
    // glue two images together and corrupt it.
    try (InputStream is = con.getInputStream();
         FileOutputStream os = new FileOutputStream(target)) {
        byte[] buffer = new byte[1024];
        int len;
        while ((len = is.read(buffer)) != -1) {
            os.write(buffer, 0, len);
        }
    }
    System.out.println(i);
}
}
}
獲取相冊的集合
/**
 * Requests the album overview page and builds, for each album, the list of its page URLs.
 *
 * <p>Each element with class {@code albumlst} is treated as one album. Its link is the
 * {@code href} of the {@code album_photo} element, and its photo count is read from the text
 * of the {@code pl} element. The first page is the link itself; every further page of 18
 * photos is {@code href + "?m_start=" + offset}. One entry is always added per album, so the
 * result has one list per {@code albumlst} element, in the order they were returned.
 *
 * <p>When the photo count cannot be read, only the first page URL is emitted for that album
 * instead of throwing.
 *
 * @param passUrl URL of the album overview page
 * @return one list of page URLs per album; empty when no document was returned
 * @throws BusinessException declared for callers; presumably raised by the request helper
 */
public List<List<String>> getAlbumList(String passUrl) throws BusinessException {
    logger.info("圖片首頁");
    // NOTE(review): this is a captured browser session cookie; it is a credential and will
    // expire. It should come from configuration rather than source code.
    String passCookie = "gmxq-IDUYXw; douban-fav-remind=1; __yadk_uid=bK748gKAh8REVU6PsYrAF24ZroyVxSYA; " +
            "__gads=ID=9cf216a578728a98:T=1587890575:S=ALNI_MbpwRrOxhrMiUZPlDmEcg-YeTzuhw; ll=\"118159\"; " +
            "__utmc=30149280; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.6241; " +
            "douban-profile-remind=1; ct=y; __utmz=30149280.1588515963.4.3.utmcsr=baidu|utmccn=(organic)" +
            "|utmcmd=organic|utmctr=%E8%B1%86%E7%93%A3%E7%9B%B8%E5%86%8C%E4%B8%8B%E8%BD%BD%E6%80%8E%E4%B9%88%E6%98" +
            "%AFwebp; dbcl2=\"62414040:cC/OMBA010s\"; ck=snAu; gr_user_id=7f5f3e5d-9cf9-4d38-9e3e-8ffebe3b22b1; " +
            "_pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1588520368%2C%22https%3A%2F%2Fwww.baidu" +
            ".com%2Fbaidu%3Fisource%3Dinfinity%26iname%3Dbaidu%26itype%3Dweb%26tn%3D02003390_42_hao_pg%26ie%3Dutf-8" +
            "%26wd%3D5kg%25E5%25A4%25A7%25E7%25B1%25B3%25E4%25B8%2580%25E4%25B8%25AA%25E4%25BA%25BA%25E5%2590%2583" +
            "%25E5%25A4%259A%25E4%25B9%2585%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.835562099.1587890488" +
            ".1588515963.1588520368.5; __utmt=1; _pk_id.100001.8cb4=d2bb86d9f8460bdf.1587890487.3.1588520483" +
            ".1588517154.; __utmb=30149280.12.10.1588520368";
    // Number of photos shown per album page; drives the m_start offsets.
    final int pageSize = 18;
    List<List<String>> albumListAll = new ArrayList<>();

    Document document = RequestUtil.sendGetRequestWithNullReturnDoc(passUrl, passCookie);
    if (document == null) {
        // NOTE(review): the helper's name suggests it returns null on failure - confirm.
        logger.info("No document returned for " + passUrl);
        return albumListAll;
    }

    for (Element album : document.getElementsByClass("albumlst")) {
        String href = album.getElementsByClass("album_photo").attr("href");
        int totalRecord = parsePhotoCount(album.getElementsByClass("pl").text());

        // Ceiling division; an unreadable count falls back to the first page only.
        int totalPageNum = totalRecord < 0 ? 1 : (totalRecord + pageSize - 1) / pageSize;

        List<String> albumList = new ArrayList<>();
        for (int page = 0; page < totalPageNum; page++) {
            albumList.add(page == 0 ? href : href + "?m_start=" + pageSize * page);
        }
        // Added even when empty so indexes stay aligned with the albumlst elements.
        albumListAll.add(albumList);
    }
    return albumListAll;
}

/**
 * Extracts the photo count from text such as {@code "18張照片"}: the run of ASCII digits
 * directly before the counter character (traditional or simplified form).
 *
 * @return the count, or {@code -1} when no counter character or no digits are found
 */
private int parsePhotoCount(String text) {
    int marker = text.indexOf("張");
    if (marker < 0) {
        marker = text.indexOf("张");
    }
    if (marker < 0) {
        return -1;
    }
    int end = marker;
    while (end > 0 && Character.isWhitespace(text.charAt(end - 1))) {
        end--;
    }
    int start = end;
    while (start > 0 && text.charAt(start - 1) >= '0' && text.charAt(start - 1) <= '9') {
        start--;
    }
    if (start == end) {
        return -1;
    }
    try {
        return Integer.parseInt(text.substring(start, end));
    } catch (NumberFormatException tooLarge) {
        // Only reachable when the digit run overflows int; treat as unreadable.
        return -1;
    }
}
獲取相冊名的集合
/**
 * Requests the album overview page and collects the album names.
 *
 * <p>Each element with class {@code albumlst} is treated as one album; its name is the text
 * of the {@code pl2} element(s) inside it. One entry is added per {@code albumlst} element,
 * in the order they were returned.
 *
 * @param passUrl URL of the album overview page
 * @return the album names; empty when the page has no albums or no document was returned
 * @throws BusinessException declared for callers; presumably raised by the request helper
 */
public List<String> getAlbumNameList(String passUrl) throws BusinessException {
    logger.info("圖片首頁");
    // NOTE(review): this is a captured browser session cookie; it is a credential and will
    // expire. It should come from configuration rather than source code.
    String passCookie = "gmxq-IDUYXw; douban-fav-remind=1; __yadk_uid=bK748gKAh8REVU6PsYrAF24ZroyVxSYA; " +
            "__gads=ID=9cf216a578728a98:T=1587890575:S=ALNI_MbpwRrOxhrMiUZPlDmEcg-YeTzuhw; ll=\"118159\"; " +
            "__utmc=30149280; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.6241; " +
            "douban-profile-remind=1; ct=y; __utmz=30149280.1588515963.4.3.utmcsr=baidu|utmccn=(organic)" +
            "|utmcmd=organic|utmctr=%E8%B1%86%E7%93%A3%E7%9B%B8%E5%86%8C%E4%B8%8B%E8%BD%BD%E6%80%8E%E4%B9%88%E6%98" +
            "%AFwebp; dbcl2=\"62414040:cC/OMBA010s\"; ck=snAu; gr_user_id=7f5f3e5d-9cf9-4d38-9e3e-8ffebe3b22b1; " +
            "_pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1588520368%2C%22https%3A%2F%2Fwww.baidu" +
            ".com%2Fbaidu%3Fisource%3Dinfinity%26iname%3Dbaidu%26itype%3Dweb%26tn%3D02003390_42_hao_pg%26ie%3Dutf-8" +
            "%26wd%3D5kg%25E5%25A4%25A7%25E7%25B1%25B3%25E4%25B8%2580%25E4%25B8%25AA%25E4%25BA%25BA%25E5%2590%2583" +
            "%25E5%25A4%259A%25E4%25B9%2585%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.835562099.1587890488" +
            ".1588515963.1588520368.5; __utmt=1; _pk_id.100001.8cb4=d2bb86d9f8460bdf.1587890487.3.1588520483" +
            ".1588517154.; __utmb=30149280.12.10.1588520368";
    List<String> albumNameList = new ArrayList<>();

    Document document = RequestUtil.sendGetRequestWithNullReturnDoc(passUrl, passCookie);
    if (document == null) {
        // NOTE(review): the helper's name suggests it returns null on failure - confirm.
        logger.info("No document returned for " + passUrl);
        return albumNameList;
    }

    // getElementsByClass already includes the element it is called on, so each albumlst
    // element can be queried directly without looking itself up again.
    for (Element album : document.getElementsByClass("albumlst")) {
        albumNameList.add(album.getElementsByClass("pl2").text());
    }
    return albumNameList;
}