這是一個基于WebMagic + SpringBoot開發(fā)的一個簡單的爬蟲案例,主要爬取前程無憂的招聘數(shù)據(jù)保存到MySQL數(shù)據(jù)庫!
數(shù)據(jù)圖
WebMagic簡介
WebMagic是一個簡單靈活的Java爬蟲框架。基于WebMagic,你可以快速開發(fā)出一個高效、易維護的爬蟲。
mysql表結構
/*
Navicat MySQL Data Transfer
Source Server : 本機數(shù)據(jù)庫
Source Server Version : 80017
Source Host : localhost:3306
Source Database : crawler
Target Server Type : MYSQL
Target Server Version : 80017
File Encoding : 65001
Date: 2019-12-08 23:36:09
*/
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for jobs_item
-- ----------------------------
DROP TABLE IF EXISTS `jobs_item`;
CREATE TABLE `jobs_item` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主鍵ID',
`company_name` varchar(100) DEFAULT NULL COMMENT '公司名稱',
`company_addr` varchar(255) DEFAULT NULL COMMENT '公司地址',
`company_info` text COMMENT '公司簡介',
`job_name` varchar(100) DEFAULT NULL COMMENT '工作名稱',
`job_num` int(11) DEFAULT '0' COMMENT '招聘人數(shù)',
`job_addr` varchar(255) DEFAULT NULL COMMENT '工作地址',
`job_info` text COMMENT '工作簡介',
`diploma` varchar(20) DEFAULT NULL COMMENT '文憑',
`salary_min` bigint(10) DEFAULT NULL COMMENT '最小月薪',
`salary_max` bigint(10) DEFAULT NULL COMMENT '最多月薪',
`url` varchar(100) DEFAULT NULL COMMENT '招聘信息詳情頁',
`time` varchar(20) DEFAULT NULL COMMENT '職位最近發(fā)布時間',
`created` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
`updated` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2844 DEFAULT CHARSET=utf8;
項目目錄圖
image.png
添加pom.xml依賴
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Spring Boot parent: supplies dependency and plugin version management -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.2.1.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>club.studycode</groupId>
<artifactId>qcwy-crawler</artifactId>
<version>1.0.0-SNAPSHOT</version>
<name>qcwy-crawler</name>
<description>JOBS</description>
<properties>
<java.version>1.8</java.version>
<mapper.version>2.1.5</mapper.version>
<webmagic.version>0.7.3</webmagic.version>
</properties>
<dependencies>
<!-- Test support; JUnit 4 vintage engine excluded in favor of JUnit 5 -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Lombok: generates boilerplate (see @Data on the entity) -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<!-- tk.mybatis general mapper: generic CRUD for the dao layer -->
<dependency>
<groupId>tk.mybatis</groupId>
<artifactId>mapper-spring-boot-starter</artifactId>
<version>${mapper.version}</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>
<!-- WebMagic core dependency (slf4j-log4j12 excluded to avoid logger conflicts) -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
<version>${webmagic.version}</version>
</dependency>
<!-- WebMagic extension dependency (Spider helpers, schedulers, pipelines) -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${webmagic.version}</version>
</dependency>
<!-- Guava — presumably pulled up for WebMagic's BloomFilterDuplicateRemover; confirm -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>28.1-jre</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
MyMapper類 定義一個tkMybatis的總接口
package club.studycode.mapper;
import tk.mybatis.mapper.common.Mapper;
import tk.mybatis.mapper.common.MySqlMapper;
/**
 * Base mapper combining tk.mybatis generic CRUD ({@link Mapper}) with
 * MySQL-specific helpers ({@link MySqlMapper}).
 * NOTE(review): kept in its own package, outside the
 * {@code club.studycode.qcwy.crawler.dao} package scanned by @MapperScan —
 * presumably so it is not registered as a mapper itself; confirm.
 */
public interface MyMapper<T> extends Mapper<T>, MySqlMapper<T> {
}
JobsItemDao接口 繼承MyMapper 相當于有了基本的CRUD方法
package club.studycode.qcwy.crawler.dao;
import club.studycode.mapper.MyMapper;
import club.studycode.qcwy.crawler.entity.JobsItem;
import org.springframework.stereotype.Repository;
/**
 * @ClassName: JobsItemDao.java
 * @Author: Slayer
 * @Date: 2019/11/16 0:59
 * @Description: Data-access interface for {@link JobsItem}. Inherits generic
 *               CRUD and Example-based query methods from MyMapper (tk.mybatis);
 *               no methods need to be declared here.
 */
@Repository
public interface JobsItemDao extends MyMapper<JobsItem> {
}
JobsItem Entity類
package club.studycode.qcwy.crawler.entity;
import java.io.Serializable;
import java.util.Date;
import lombok.Data;
import javax.persistence.Column;
import javax.persistence.Id;
import javax.persistence.Table;
/**
 * @ClassName: QcwyItem.java
 * @Author: Slayer
 * @Date: 2019/11/16 0:51
 * @Description: Entity mapped to the `jobs_item` table; one crawled job
 *               posting. Getters/setters/equals/hashCode/toString are
 *               generated by Lombok's @Data.
 */
@Data
@Table(name = "jobs_item")
public class JobsItem implements Serializable {
private static final long serialVersionUID = -1274246480063610692L;
// primary key (auto-increment in the table)
@Id
@Column(name = "id")
private Long id;
// company name
@Column(name = "company_name")
private String companyName;
// company address
@Column(name = "company_addr")
private String companyAddr;
// company profile text
@Column(name = "company_info")
private String companyInfo;
// job title
@Column(name = "job_name")
private String jobName;
// number of openings (9999 is used as a sentinel for "unspecified")
@Column(name = "job_num")
private Integer jobNum;
// job location
@Column(name = "job_addr")
private String jobAddr;
// job description text
@Column(name = "job_info")
private String jobInfo;
// required diploma / education level
@Column(name = "diploma")
private String diploma;
// minimum monthly salary
// NOTE(review): DB column is bigint(10) while this field is Integer — confirm the range is sufficient
@Column(name = "salary_min")
private Integer salaryMin;
// maximum monthly salary
@Column(name = "salary_max")
private Integer salaryMax;
// URL of the job-detail page this record was crawled from
@Column(name = "url")
private String url;
// latest publish time of the posting (kept as raw text)
@Column(name = "time")
private String time;
// record creation timestamp (set by the crawler)
@Column(name = "created")
private Date created;
// record last-update timestamp (set by the crawler)
@Column(name = "updated")
private Date updated;
}
JobsItemService 業(yè)務層接口
package club.studycode.qcwy.crawler.service;
import club.studycode.qcwy.crawler.entity.JobsItem;
/**
 * @ClassName: JobsItemService.java
 * @Author: Slayer
 * @Date: 2019/12/8 23:42
 * @Description: Business interface for persisting and querying crawled job records.
 */
public interface JobsItemService {
// Insert the record when its id is null, otherwise update the existing row.
void save(JobsItem jobsItem);
// Fetch a single record by company name; implementations may return null when absent.
JobsItem getByCompanyName(String companyName);
}
JobsItemServiceImpl 業(yè)務實現(xiàn)類
package club.studycode.qcwy.crawler.service.impl;
import club.studycode.qcwy.crawler.dao.JobsItemDao;
import club.studycode.qcwy.crawler.entity.JobsItem;
import club.studycode.qcwy.crawler.service.JobsItemService;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import tk.mybatis.mapper.entity.Example;
import javax.annotation.Resource;

/**
 * Service layer for crawled job records. Read-only transactions by default;
 * {@link #save} opens a writable transaction.
 */
@Transactional(readOnly = true)
@Service
public class JobsItemServiceImpl implements JobsItemService {
    @Resource
    private JobsItemDao jobsItemDao;

    /**
     * Persists a record: an item without an id is inserted as a new row,
     * an item carrying an id updates the existing row by primary key.
     *
     * @param jobsItem record to persist
     */
    @Override
    @Transactional(readOnly = false)
    public void save(JobsItem jobsItem) {
        if (jobsItem.getId() != null) {
            // existing row — overwrite it
            jobsItemDao.updateByPrimaryKey(jobsItem);
        } else {
            // fresh record — insert
            jobsItemDao.insert(jobsItem);
        }
    }

    /**
     * Looks up a single record whose company_name equals the given value.
     *
     * @param companyName exact company name to match
     * @return the matching record, or null when none exists
     */
    @Override
    public JobsItem getByCompanyName(String companyName) {
        final Example query = new Example(JobsItem.class);
        query.createCriteria().andEqualTo("companyName", companyName);
        return jobsItemDao.selectOneByExample(query);
    }
}
JobProcessor WebMagic 核心類 主要定義要爬取數(shù)據(jù)的規(guī)則
package club.studycode.qcwy.crawler.task;
import club.studycode.qcwy.crawler.entity.JobsItem;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;
import java.util.Date;
import java.util.List;

/**
 * @ClassName: JobProcessor.java
 * @Author: Slayer
 * @Date: 2019/12/8 23:43
 * @Description: WebMagic PageProcessor crawling 51job search results.
 *               List pages enqueue each detail URL plus the next page;
 *               detail pages are parsed into {@link JobsItem} and handed
 *               to the pipeline under the "jobsItem" result key.
 */
@Component
public class JobProcessor implements PageProcessor {
    // FIX: the original URL contained mojibake "°reefrom" — the parameter is "&degreefrom".
    private static final String URL_CRAWLER = "https://search.51job.com/list/000000,000000,0000,00,9,99,java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";
    // running counter of parsed detail pages (diagnostic output only)
    private long num = 1;

    @Override
    public void process(Page page) {
        List<Selectable> selectables = page.getHtml().css("div#resultList div.el").nodes();
        if (selectables.size() == 0) {
            // no result list found -> this is a job-detail page
            this.saveJobInfo(page);
        } else {
            // list page: enqueue the detail URL of every result row
            selectables.forEach(selectable -> {
                String jobInfoUrl = selectable.css("p.t1 > span > a[href]").links().get();
                page.addTargetRequest(jobInfoUrl);
            });
            // enqueue the next page when the pager has a "next" element
            // FIX: guard the index — on the last page .nodes() has fewer entries and
            // the original get(1) threw IndexOutOfBoundsException
            List<Selectable> pager = page.getHtml().css("li.bk").nodes();
            if (pager.size() > 1) {
                String nextPage = pager.get(1).links().get();
                page.addTargetRequest(nextPage);
            }
        }
    }

    /**
     * Extracts one job posting from a detail page and publishes it under
     * the "jobsItem" result key for the pipeline.
     *
     * @param page detail page to parse
     */
    private void saveJobInfo(Page page) {
        JobsItem jobsItem = new JobsItem();
        // company name
        jobsItem.setCompanyName(page.getHtml().css("p.cname > a", "text").get());
        // summary line (address / diploma / headcount / publish time)
        this.saveCompanyInfo(page.getHtml().css("p.msg", "text").get(), jobsItem);
        // company profile
        jobsItem.setCompanyInfo(page.getHtml().css("div.tmsg", "text").get());
        // job title
        jobsItem.setJobName(page.getHtml().css("div.cn > h1", "text").get());
        // job address: the second p.fp node when present
        List<Selectable> jobAddrSelectables = page.getHtml().css("p.fp").nodes();
        if (jobAddrSelectables.size() > 1) {
            jobsItem.setJobAddr(jobAddrSelectables.get(1).css("p.fp", "text").get());
        }
        // job description: strip HTML tags via Jsoup
        List<Selectable> jobInfoSelectables = page.getHtml().css("div.bmsg").nodes();
        if (jobInfoSelectables.size() >= 1) {
            String jobInfoHtml = jobInfoSelectables.get(0).get();
            jobsItem.setJobInfo(Jsoup.parse(jobInfoHtml).text());
        }
        // salary range, e.g. "1-1.5萬/月"
        this.saveSalary(page.getHtml().css("div.cn > strong", "text").get(), jobsItem);
        // source URL and timestamps
        jobsItem.setUrl(page.getUrl().get());
        jobsItem.setCreated(new Date());
        jobsItem.setUpdated(jobsItem.getCreated());
        // hand the record to the pipeline
        page.putField("jobsItem", jobsItem);
        System.out.println("爬蟲次數(shù)" + this.num++);
    }

    /**
     * Parses a salary range like "1-1.5萬/月" into min/max monthly salary.
     * Unrecognized formats are skipped and leave the salary fields unset.
     *
     * @param salary   raw salary text from the page (may be blank)
     * @param jobsItem record to fill
     */
    private void saveSalary(String salary, JobsItem jobsItem) {
        if (StringUtils.isBlank(salary)) {
            return;
        }
        String[] split = salary.split("-");
        // FIX: the original indexed split[1] unconditionally and threw on
        // salaries without "-" (e.g. "1萬以下/月") or with a short second half
        if (split.length < 2 || split[1].length() < 3) {
            return;
        }
        // unit character sits 3 chars from the end: "<value>[千|萬]/月"
        char unit = split[1].charAt(split[1].length() - 3);
        double factor = 0;
        switch (unit) {
            case '千':
                factor = 1000;
                break;
            case '萬':
                factor = 10000;
                break;
            default:
                break;
        }
        try {
            // minimum monthly salary
            jobsItem.setSalaryMin((int) (Double.parseDouble(split[0]) * factor));
            // maximum monthly salary
            jobsItem.setSalaryMax((int) (Double.parseDouble(split[1].substring(0, split[1].length() - 3)) * factor));
        } catch (NumberFormatException ignored) {
            // non-numeric salary text — leave the fields unset rather than kill the crawl
        }
    }

    /**
     * Parses the detail-page summary line into company address, headcount,
     * diploma and publish time.
     * NOTE(review): the segment order (addr / exp / diploma / headcount / date)
     * is assumed from the original indexing — confirm against the live page.
     *
     * @param companyInfo raw summary text (segments separated by '　')
     * @param jobsItem    record to fill
     */
    private void saveCompanyInfo(String companyInfo, JobsItem jobsItem) {
        if (StringUtils.isBlank(companyInfo)) {
            return;
        }
        String[] companyInfos = companyInfo.split(" ");
        // FIX: guard against summary lines with fewer segments than expected
        if (companyInfos.length < 4) {
            return;
        }
        // company address
        jobsItem.setCompanyAddr(companyInfos[0].trim());
        // headcount; "招若干人" (unspecified) is stored as sentinel 9999
        if ("招若干人".equals(companyInfos[3].trim())) {
            jobsItem.setJobNum(9999);
        } else {
            String num = companyInfos[3].replaceAll("[^0-9]", "");
            if (!StringUtils.isBlank(num)) {
                jobsItem.setJobNum(Integer.parseInt(num));
            }
        }
        // diploma; when slot 2 already holds the headcount ("招…"), no diploma is listed
        if (companyInfos[2].contains("招")) {
            jobsItem.setDiploma("無學歷");
            String num = companyInfos[2].replaceAll("[^0-9]", "");
            if (!StringUtils.isBlank(num)) {
                jobsItem.setJobNum(Integer.parseInt(num));
            }
        } else {
            jobsItem.setDiploma(companyInfos[2].trim());
        }
        // publish time: the segment ending in "發(fā)布", stripped of that suffix
        for (String time : companyInfos) {
            if (!StringUtils.isBlank(time) && time.indexOf("發(fā)布") > 0) {
                jobsItem.setTime(time.replace("發(fā)布", ""));
            }
        }
    }

    private Site site = Site.me()
            // 51job pages are GBK encoded
            .setCharset("gbk")
            // request timeout
            .setTimeOut(10 * 1000)
            // pause between retries
            .setRetrySleepTime(3 * 1000)
            // FIX: the original called setSleepTime(3) under a "retry count" comment;
            // setRetryTimes(3) is what was intended (a 3 ms sleep would hammer the site)
            .setRetryTimes(3);

    @Override
    public Site getSite() {
        return site;
    }

    @Autowired
    private SaveDataPipeline saveDataPipeline;

    /**
     * Scheduled crawl entry point.
     * initialDelay: wait after startup before the first run;
     * fixedDelay: gap between the end of one run and the start of the next.
     */
    @Scheduled(initialDelay = 1000, fixedDelay = 10 * 1000)
    public void process() {
        // FIX: use the Spring-managed bean (this) instead of new JobProcessor(),
        // so the processor actually running is the configured component
        Spider.create(this)
                .addUrl(URL_CRAWLER)
                // BloomFilter dedup: low memory, may occasionally skip a page;
                // 100000 is the estimated page count
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(1)
                // persist results through the database pipeline
                .addPipeline(this.saveDataPipeline)
                .run();
    }
}
SaveDataPipeline 主要把爬取的數(shù)據(jù)保存到數(shù)據(jù)庫中
package club.studycode.qcwy.crawler.task;
import club.studycode.qcwy.crawler.entity.JobsItem;
import club.studycode.qcwy.crawler.service.JobsItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * @ClassName: SaveDataPipeline.java
 * @Author: Slayer
 * @Date: 2019/12/8 23:46
 * @Description: WebMagic pipeline persisting crawled JobsItem records.
 *               A record whose company name already exists is treated as an
 *               update of that row; everything else is inserted.
 */
@Component
public class SaveDataPipeline implements Pipeline {
    @Autowired
    private JobsItemService jobsItemService;

    @Override
    public void process(ResultItems resultItems, Task task) {
        // pull the record the processor published under "jobsItem"
        JobsItem item = resultItems.get("jobsItem");
        // skip pages that produced no usable record
        if (item == null || item.getCompanyName() == null) {
            return;
        }
        // dedupe by company name: reuse the id of an existing row so save() updates it
        JobsItem existing = jobsItemService.getByCompanyName(item.getCompanyName());
        if (existing != null) {
            item.setId(existing.getId());
            System.out.println("-----------------------------更新數(shù)據(jù)啦----------------------------------");
        }
        jobsItemService.save(item);
    }
}
QcwyCrawlerApplication SpringBoot啟動類
package club.studycode.qcwy.crawler;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.transaction.annotation.EnableTransactionManagement;
import tk.mybatis.spring.annotation.MapperScan;
/**
 * Spring Boot entry point.
 * - @MapperScan registers tk.mybatis mapper interfaces under the dao package
 * - @EnableScheduling activates the @Scheduled crawler task in JobProcessor
 * - @EnableTransactionManagement enables @Transactional in the service layer
 */
@SpringBootApplication
@MapperScan(basePackages = "club.studycode.qcwy.crawler.dao")
@EnableScheduling
@EnableTransactionManagement
public class QcwyCrawlerApplication {
public static void main(String[] args) {
SpringApplication.run(QcwyCrawlerApplication.class, args);
}
}
application.yaml配置
# Datasource (HikariCP) + MyBatis configuration for the crawler database.
spring:
datasource:
type: com.zaxxer.hikari.HikariDataSource
driver-class-name: com.mysql.cj.jdbc.Driver
url: jdbc:mysql://localhost:3306/crawler?useUnicode=true&characterEncoding=utf-8&useSSL=false&serverTimezone=Asia/Shanghai
username: root
# NOTE(review): plaintext DB credentials committed to the repo — move to an
# environment variable or externalized/encrypted configuration
password: "020822"
# HikariCP connection-pool tuning
hikari:
minimum-idle: 5
idle-timeout: 600000
maximum-pool-size: 10
auto-commit: true
pool-name: MyHikariCP
max-lifetime: 1800000
connection-timeout: 30000
connection-test-query: SELECT 1
# MyBatis: package whose classes get short type aliases (the entity package)
mybatis:
type-aliases-package: club.studycode.qcwy.crawler.entity