獲取每個頁面圖片鏈接地址
package com.wxq.pachong;
import com.alibaba.fastjson.JSON;
import java.util.ArrayList;
import java.util.List;
/**
* @title:
* @description:
* @author:
* @date:2019/3/11 15:09
**/
public class JianDanHtmlParserimplements Runnable{
private Stringhtml;
? ? private int page;
? ? public JianDanHtmlParser(String html,int page) {
this.html = html;
? ? ? ? this.page = page;
? ? }
@Override
? ? public void run() {
System.out.println("==========第"+page+"頁(yè)============");
? ? ? ? List list =new ArrayList();
? ? ? ? html =html.substring(html.indexOf("list"));
? ? ? ? String[] images =html.split("li>");
? ? ? ? for (String image : images) {
String[] ss = image.split("br");
? ? ? ? ? ? for (String s : ss) {
if (s.indexOf("<img src=") >0) {
try{
int i = s.indexOf("<img src=\"") +"<img src=\"".length();
? ? ? ? ? ? ? ? ? ? ? ? list.add(s.substring(i, s.indexOf("\"", i +1)));
? ? ? ? ? ? ? ? ? ? }catch (Exception e) {
System.out.println(s);
? ? ? ? ? ? ? ? ? ? }
}
}
}
for(String imageUrl : list){
System.out.println("圖片鏈接:"+ JSON.toJSONString(imageUrl));
? ? ? ? ? ? if(imageUrl.indexOf("uploads")>0){
new Thread(new JianDanImageCreator(imageUrl,page)).start();
? ? ? ? ? ? }
}
}
}
將圖片鏈接下載到本地
package com.wxq.pachong;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
/**
 * Downloads one image URL into the local storage directory. The file is named
 * {@code <page>--<last path segment of the URL>}.
 *
 * <p>Instances are started on separate threads by {@code JianDanHtmlParser}, so the shared
 * download counter is kept in an atomic integer.
 *
 * <p>Date: 2019/3/11 15:09
 */
public class JianDanImageCreator implements Runnable {

    /** Number of images downloaded so far, shared by all instances. */
    private static final java.util.concurrent.atomic.AtomicInteger count =
            new java.util.concurrent.atomic.AtomicInteger(0);

    /** Storage directory; customize as needed. */
    private static final String basePath = "E:/jiandan";

    private final String imageUrl;
    private final int page;

    /**
     * @param imageUrl absolute URL of the image to download
     * @param page page number the image was found on; used as a file-name prefix
     */
    public JianDanImageCreator(String imageUrl, int page) {
        this.imageUrl = imageUrl;
        this.page = page;
    }

    /**
     * Creates the storage directory if needed, copies the image bytes to disk and prints
     * the running download count. I/O failures are reported to stderr and end this download
     * without throwing.
     */
    @Override
    public void run() {
        File dir = new File(basePath);
        if (!dir.exists() && dir.mkdirs()) {
            System.out.println("圖片存放于" + basePath + "目錄下");
        }
        String imageName = imageUrl.substring(imageUrl.lastIndexOf("/") + 1);
        File file = new File(dir, page + "--" + imageName);

        // Open the remote stream first so a bad URL does not leave an empty file behind;
        // try-with-resources closes both streams even when the copy fails midway.
        try (InputStream is = new URL(imageUrl).openStream();
                OutputStream os = new FileOutputStream(file)) {
            byte[] buff = new byte[8192];
            int read;
            while ((read = is.read(buff)) != -1) {
                // Write straight from the buffer; no per-chunk temporary array is needed.
                os.write(buff, 0, read);
            }
        } catch (java.io.IOException e) {
            System.err.println("Failed to download " + imageUrl + " to " + file.getAbsolutePath());
            e.printStackTrace();
            return;
        }
        System.out.println("第" + count.getAndIncrement() + "張妹子:" + file.getAbsolutePath());
    }
}
爬蟲執行方法
package com.wxq.pachong;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import java.io.InputStream;
/**
* @title:
* @description:
* @author:
* @date:2019/3/11 15:07
**/
public class SimpleSpider {
//起始頁(yè)碼
? ? private static final int page =1264;
? ? public static void main(String[] args) {
//HttpClient 超時(shí)配置
? ? ? ? RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).setConnectionRequestTimeout(6000).setConnectTimeout(6000).build();
? ? ? ? CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(globalConfig).build();
? ? ? ? System.out.println("5秒后開(kāi)始抓取煎蛋妹子圖……");
? ? ? ? for (int i =page; i >0; i--) {
//創(chuàng)建一個(gè)GET請(qǐng)求
? ? ? ? ? ? HttpGet httpGet =new HttpGet("http://www.jf258.com/nvsheng/"+ i+"1.html");
? ? ? ? ? ? httpGet.addHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36");
? ? ? ? ? ? httpGet.addHeader("Cookie","_gat=1; nsfw-click-load=off; gif-click-load=on; _ga=GA1.2.1861846600.1423061484");
? ? ? ? ? ? try {
//不敢爬太快
? ? ? ? ? ? ? ? Thread.sleep(5000);
? ? ? ? ? ? ? ? //發(fā)送請(qǐng)求,并執(zhí)行
? ? ? ? ? ? ? ? CloseableHttpResponse response = httpClient.execute(httpGet);
? ? ? ? ? ? ? ? InputStream in = response.getEntity().getContent();
? ? ? ? ? ? ? ? String html = Utils.convertStreamToString(in);
? ? ? ? ? ? ? ? //網(wǎng)頁(yè)內(nèi)容解析
? ? ? ? ? ? ? ? new Thread(new JianDanHtmlParser(html, i)).start();
? ? ? ? ? ? }catch (Exception e) {
e.printStackTrace();
? ? ? ? ? ? }
}
}
}
工具類
package com.wxq.pachong;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
 * Stream helper methods.
 *
 * <p>Date: 2019/3/11 15:20
 */
public class Utils {

    /**
     * Reads the whole stream as UTF-8 text and closes it.
     *
     * <p>Every line read is terminated with a single {@code '\n'} in the result, whatever
     * line terminator the input used, and the last line gets one even if the input had none.
     * If reading fails, the stack trace is printed and the text read so far is returned.
     *
     * @param is stream to read; closed before this method returns
     * @return the decoded text, or an empty string for an empty stream
     * @throws NullPointerException if {@code is} is {@code null}
     */
    public static String convertStreamToString(InputStream is) {
        // Decode explicitly as UTF-8 so the result does not depend on the platform default.
        return convertStreamToString(is, java.nio.charset.StandardCharsets.UTF_8);
    }

    /**
     * Reads the whole stream as text in the given charset and closes it. Line handling and
     * error handling are the same as in {@link #convertStreamToString(InputStream)}.
     *
     * @param is stream to read; closed before this method returns
     * @param charset charset used to decode the bytes
     * @return the decoded text, or an empty string for an empty stream
     * @throws NullPointerException if either argument is {@code null}
     */
    public static String convertStreamToString(InputStream is, java.nio.charset.Charset charset) {
        java.util.Objects.requireNonNull(is, "is");
        java.util.Objects.requireNonNull(charset, "charset");
        StringBuilder sb = new StringBuilder();
        // Closing the reader also closes the wrapped input stream.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } catch (IOException e) {
            // Best effort: report the failure and return what was read so far.
            e.printStackTrace();
        }
        return sb.toString();
    }
}