在酒店隔離第10天
凌晨的廈門還是有些涼,我把黑色衛衣的帽子戴上
仿佛我是名在網絡上為維護世界和平而重拳出擊的黑客......
直到看了看我的髮量.. 想想算了還是當個普通人比較好
前陣子學了爬蟲,一直沒用來實戰
突然想試試, 我說可以, 很快啊
上來(lái)就是一左重蹬 右鞭腿 左刺拳..
爬蟲有啥用
大數據時代, 爬蟲用來獲取互聯網中的有價值的數據
比如爬取微博 積木的小姐姐啦
分析pronhub有沒有你女朋友的視頻啦
爬到的數據庫截圖
image.png
建表語(yǔ)句
/*
 Source Server Type    : MySQL
 Source Server Version : 50730
*/

-- Schema for the crawler's result table. One row is written per crawled SKU.
SET NAMES utf8mb4;

-- Foreign-key checks are switched off while the table is dropped and
-- re-created, and switched back on at the end of the script.
SET FOREIGN_KEY_CHECKS = 0;

DROP TABLE IF EXISTS `jd_item`;
CREATE TABLE `jd_item` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主鍵',
  `create_time` timestamp NULL DEFAULT NULL COMMENT '創建時間',
  `type` bigint(1) DEFAULT NULL COMMENT '類型',
  `sku` bigint(20) DEFAULT NULL COMMENT '最小商品單元',
  `spu` bigint(20) DEFAULT NULL COMMENT '聚合單元',
  `item_name` text COLLATE utf8mb4_unicode_ci COMMENT '商品名',
  `img` text COLLATE utf8mb4_unicode_ci COMMENT '圖片url',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=348 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='京東商品';

SET FOREIGN_KEY_CHECKS = 1;
建立項目, 上代碼
創建maven工程,引入spring boot
pom文件
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Build descriptor for the JD crawler demo: a Spring Boot 2.4.1 application
  (JPA + MySQL for persistence, jsoup for HTML parsing, Apache HttpClient for
  downloads).
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.4.1</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.example</groupId>
    <artifactId>demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>JingDongGetDemo</name>
    <description>JingDongGetDemo</description>
    <properties>
        <java.version>1.8</java.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-quartz</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <!-- Not managed by the Spring Boot parent, so the version stays explicit. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <!--
          Version intentionally omitted: the Spring Boot parent manages
          httpclient, and an explicit older pin would override (downgrade) it.
        -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>
        <!-- Version managed by the Spring Boot parent (see note on httpclient). -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
        <!-- Version managed by the Spring Boot parent (see note on httpclient). -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <!-- Not managed by the Spring Boot parent, so the version stays explicit. -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <!-- Lombok is only needed at compile time; keep it out of the repackaged jar. -->
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
封裝http請求類
package com.example.demo.utils;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.apache.tomcat.jni.OS;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
/**
* 封裝http請(qǐng)求類
*/
@Component
@Slf4j
public class HttpUtils {
String cookie = "__jdu=15964473360901187258403; shshshfpa=fadd29f9-813e-e8c5-fad1-5e70d45f5276-1596617912; shshshfpb=lgnFkT9%20fxrnZdfzawNuExA%3D%3D; o2State={%22webp%22:true}; pinId=sAzz8XNBRxQF0-GrxufAZLV9-x-f3wj7; pin=jd_49802f10b0cfc; unick=jd_49802f10b0cfc; _tp=yfPXFZws%2F1K7xphvbgdzaamZ1L9rpl0RAQIHmzRCPQM%3D; _pst=jd_49802f10b0cfc; user-key=a109a42d-1a2b-49d3-bd98-53783efeaa94; areaId=18; ipLoc-djd=18-1482-1485-49034; cn=20; unpl=V2_ZzNtbUtVQUFwDhZQeBpfVmIFGwhLAkIdcAhAU3lLXAViV0UKclRCFnQURldnGV8UZAIZXUFcQhxFCENkexhdBWMGEV5EVnMlMEsWBi8FXAdkAxJURlVAEXEIRVV%2bEFsAZjMRXXJWcxVxAEFRfh9eB2ACGlxCV0oTdgxPUXobbDVnCxZtQlZEFnQOQlN%2fGl41sa2GiMromqXQ3uz6rpTs0O6oxuPEZ0McdQpOVXoZWwZXAiJcchYtFXYARlR4VFwBbwQXWERVQRJ0AEdUexBaBmMKF1xAZ0Ildg%3d%3d; __jdv=122270672|www.zhihu.com|t_1001542270_1003231966_4000320817_3002725071|tuiguang|833e47a5222b478d8d0941763c114eff|1607775130483; PCSYCityID=CN_350000_350200_350203; TrackID=1vODXrK7bZO2EpDh1Nm6sfAqCk-Gs3dPa04wZrr88-S-XPgS5RmPdRgOkuyjgDCzx2l0bqptjiyuNbfelGhkPs7DZ-HlXTRWLdRgbeyueJDoB9TE9EWxFy0YQZ5CZBDGe; thor=762FF186473632AA10BE04F04DD8167499B0ED4286B401207CEE4E033A5989B5862D74A22CEE3390783A6D473BFD74E494C4013BBEA8D5C4D930A373DE163CDDDB5E5902ED9FF16E0B0AEAD53CB7EC3DE05C85A08AE5187A72BB58F825939F3DFB0A273D903500254B0E25CEC29CF7967E7C477315C18A139D9127C91D116828DCF057F4BDB76479B81213D06249FF700DD79CF1ED15C4E4BDCF3E9A6DFB9219; ceshi3.com=000; __jda=76161171.15964473360901187258403.1596447336.1608198065.1608206528.119; __jdb=76161171.6.15964473360901187258403|119.1608206528; __jdc=76161171; shshshfp=96550553530deab8d4b189c0755c3ff2; shshshsID=a55dfa051705e3f0f93c49319b54e32d_3_1608207381885";
/**
* 連接池管理器
*/
private PoolingHttpClientConnectionManager cm;
/**
* 構(gòu)造器
*/
public HttpUtils() {
this.cm = new PoolingHttpClientConnectionManager();
// 設(shè)置最大連接數(shù)
this.cm.setMaxTotal(100);
// 每個(gè)主機(jī)數(shù)連接數(shù)
this.cm.setDefaultMaxPerRoute(10);
}
/**
* 根據(jù)url下載數(shù)據(jù)
*
* @param url
* @return
*/
public String doGet(String url) {
// 獲取http client對(duì)象
CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
// 創(chuàng)建http請(qǐng)求對(duì)象 設(shè)置url地址
HttpGet httpGet = new HttpGet(url);
// httpGet.setHeader("Cookie", cookie);
httpGet.setHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
httpGet.setConfig(this.getConfig());
CloseableHttpResponse response = null;
try {
// 使用http client
response = httpClient.execute(httpGet);
// 發(fā)起請(qǐng)求獲取響應(yīng)
if (response.getStatusLine().getStatusCode() == 200) {
// 判斷響應(yīng)體是否不為空
if (response.getEntity() != null) {
String string = EntityUtils.toString(response.getEntity(), "utf-8");
return string;
} else {
return "";
}
}
} catch (IOException e) {
e.printStackTrace();
} finally {
if (response != null) {
try {
response.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return "";
}
/**
* 設(shè)置請(qǐng)求配置
*
* @return
*/
private RequestConfig getConfig() {
RequestConfig requestConfig = RequestConfig.custom()
// 創(chuàng)建連接時(shí)間
.setConnectTimeout(1000)
// 獲取連接的最長(zhǎng)時(shí)間
.setConnectionRequestTimeout(1000)
// 獲取數(shù)據(jù)時(shí)間
.setSocketTimeout(10000)
.build();
return requestConfig;
}
/**
* 下載圖片
*
* @param url
* @return 圖片硬盤地址
*/
public String doGetImg(String url) {
// 獲取http client對(duì)象
CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
// 創(chuàng)建http請(qǐng)求對(duì)象 設(shè)置url地址
HttpGet httpGet = new HttpGet(url);
httpGet.setConfig(this.getConfig());
CloseableHttpResponse response = null;
try {
// 使用http client
response = httpClient.execute(httpGet);
// 發(fā)起請(qǐng)求獲取響應(yīng)
if (response.getStatusLine().getStatusCode() == 200) {
// 判斷響應(yīng)體是否不為空
if (response.getEntity() != null) {
// 獲取圖片的后綴
String extImg = url.substring(url.lastIndexOf("."));
// 重命名圖片
String picName = UUID.randomUUID().toString() + extImg;
// 下載圖片
File file = new File("/Users/giaogiao/Documents/hewei/code/myCode/JingDongGetDemo/src/main/resources/static/img/" + picName);
OutputStream outputStream = new FileOutputStream(file);
response.getEntity().writeTo(outputStream);
// 返回圖片名稱
return picName;
} else {
return "";
}
}
} catch (IOException e) {
e.printStackTrace();
} finally {
if (response != null) {
try {
response.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return "";
}
}
爬蟲定時任務, 核心代碼
package com.example.demo.task;
import com.example.demo.pojo.JdItem;
import com.example.demo.service.JdItemService;
import com.example.demo.utils.HttpUtils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.sql.Date;
/**
 * Scheduled crawler task: fetches one JD search-result page, extracts every
 * SKU listed on it, downloads its image and stores it through
 * {@link JdItemService}.
 */
@Component
@Slf4j
public class ItemTask {
    @Autowired
    private HttpUtils httpUtils;
    @Autowired
    private JdItemService jdItemService;

    /**
     * Fetches the search page and persists the items found on it.
     * Re-runs 100 seconds after the previous run has finished.
     */
    @Scheduled(fixedDelay = 100 * 1000)
    public void task() {
        log.debug("task....");
        // First result page of a search for the URL-encoded keyword "手机".
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&page=1&s=1&click=0";
        String htmlString = httpUtils.doGet(url);
        log.debug("htmlString:{}", htmlString);
        // doGet signals failure with an empty string; there is nothing to parse then.
        if (htmlString != null && !htmlString.isEmpty()) {
            parse(htmlString);
        }
        log.debug("完成");
    }

    /**
     * Parses a search-result page and saves one {@link JdItem} per SKU.
     *
     * <p>Each entry under {@code #J_goodsList > ul > li} is one product; each
     * {@code li.ps-item} inside it is one SKU of that product. SKUs whose
     * {@code data-sku} is missing or not numeric are skipped so that one bad
     * entry does not abort the whole page.
     *
     * @param htmlString HTML of the search-result page
     */
    private void parse(String htmlString) {
        Elements goodsLists = Jsoup.parse(htmlString).select("#J_goodsList > ul > li");
        for (Element element : goodsLists) {
            // Product-level values are the same for every SKU of this product.
            Long productSpu = parseLongOrNull(element.attr("data-spu"));
            String productName = element.select("em").text();

            // All SKUs of this product.
            Elements items = element.select("li.ps-item");
            for (Element item : items) {
                String skuText = item.select("[data-sku]").attr("data-sku");
                Long sku = parseLongOrNull(skuText);
                if (sku == null) {
                    log.warn("Skipping item with missing or invalid data-sku: '{}'", skuText);
                    continue;
                }
                // Fall back to the SKU when the product carries no usable SPU.
                Long spu = productSpu != null ? productSpu : sku;
                String color = item.select("[title]").attr("title");
                String title = color + productName;
                String img = item.select("[data-lazy-img]").attr("data-lazy-img");

                JdItem jdItem = new JdItem();
                jdItem.setCreateTime(new Date(System.currentTimeMillis()));
                jdItem.setType(0L);
                jdItem.setSku(sku);
                jdItem.setSpu(spu);
                jdItem.setItemName(title);
                // The attribute is scheme-relative ("//host/path"); without it there is nothing to download.
                jdItem.setImg(img.isEmpty() ? "" : httpUtils.doGetImg("https:" + img));
                jdItemService.save(jdItem);
            }
        }
    }

    /**
     * Parses {@code text} as a {@code Long}.
     *
     * @return the parsed value, or {@code null} when {@code text} is null,
     *         blank or not a valid number
     */
    private static Long parseLongOrNull(String text) {
        if (text == null) {
            return null;
        }
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        try {
            return Long.valueOf(trimmed);
        } catch (NumberFormatException ignored) {
            // Not numeric: the caller treats this the same as a missing attribute.
            return null;
        }
    }
}
本項目爬取京東只是一個引子,一個最基礎的案例, 將來還會想到其他比較好玩的項目,我再來分享
本項目優化和改進的點:
- 比較簡易,沒有使用爬蟲框架 如webmagic
- 沒有多線程優化