一、項目框架
二、代碼實現
1、數據庫建表
1)、創建動態信息表
-- Timeline entries written by the Jianshu crawler later in this document: one row per entry.
-- The table is dropped and recreated, so any previously crawled rows are lost when this runs.
DROP TABLE IF EXISTS `user_dymatic_info`;
CREATE TABLE `user_dymatic_info` (
-- Surrogate key generated by MySQL.
`_id` int(11) NOT NULL AUTO_INCREMENT,
-- Comment text; the crawler fills it only for entries of type "comment_note", otherwise ''.
`content` longtext NOT NULL,
-- Entry timestamp stored as text (the crawler strips everything from the first '+' onwards).
`time` varchar(60) NOT NULL DEFAULT '',
-- Identifier of the user the entry belongs to (last path segment of the profile link).
`slug` varchar(60) NOT NULL DEFAULT '',
-- Value of the entry's data-type attribute, e.g. "comment_note".
`dymatic_type` varchar(60) NOT NULL DEFAULT '',
-- For comments: last path segment of the commented article's link; otherwise ''.
`extra_content` varchar(500) NOT NULL DEFAULT '',
PRIMARY KEY (`_id`)
-- NOTE(review): the crawler looks rows up by (time, slug, dymatic_type) for every entry and
-- no index covers those columns; consider adding one if the table grows.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
2)、創建用戶信息表
-- Per-user profile counters written by the Jianshu crawler later in this document: one row per user.
-- The table is dropped and recreated, so existing rows are lost when this runs.
DROP TABLE IF EXISTS `user_information`;
CREATE TABLE `user_information` (
-- User identifier (the crawler stores the last path segment of the profile link here).
`id` varchar(255) NOT NULL primary key,
-- The five counters below are stored as the raw text scraped from the profile page,
-- in page order: following, followers, articles, words written, likes received.
`follow` varchar(255) NOT NULL ,
`follower` varchar(255) NOT NULL ,
`article` varchar(255) NOT NULL ,
`words` varchar(255) NOT NULL ,
-- `like` is a reserved word in MySQL, so it must stay back-quoted wherever it is used.
`like` varchar(255) NOT NULL
-- No ENGINE/CHARSET is given here, so the server defaults apply.
)
;
2、jsoup爬蟲以及數據寫入數據庫
使用maven創建java項目
1)、添加如下依賴:
<!-- jsoup: HTTP fetching and CSS-selector based HTML parsing used by the crawler class. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
<!-- fastjson: JSON library. NOTE(review): the Jianshu class in this document does not import it. -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.56</version>
</dependency>
<!-- MySQL JDBC driver; the crawler loads it by the class name com.mysql.jdbc.Driver. -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>
2)、編寫爬蟲以及寫入數據庫代碼
package com.neusoft;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.sql.*;
/**
 * Crawls one user's profile counters and timeline entries with jsoup and stores them in the
 * MySQL tables {@code user_information} and {@code user_dymatic_info}.
 *
 * <p>The crawl is resumable: timeline pages are read newest-first and the crawl stops at the
 * first entry that is already present in {@code user_dymatic_info}.
 */
public class Jianshu {

    /** Prefix of every timeline URL; the user id and the query string are appended to it. */
    private static final String USERS_URL = "http://www.reibang.com/users/";

    /** User-Agent header sent with every page request. */
    private static final String USER_AGENT = "Mozilla";

    /** Request timeout in milliseconds (value kept from the original implementation). */
    private static final int TIMEOUT_MS = 10000000;

    /** JDBC connection settings. NOTE(review): credentials are hard-coded; externalise them. */
    private static final String JDBC_URL = "jdbc:mysql://localhost:3306/test";
    private static final String DB_USER = "root";
    private static final String DB_PASSWORD = "root";

    /**
     * Crawls the given user: upserts the profile counters, then stores every timeline entry
     * that is not yet in the database.
     *
     * @param id user id as it appears in the timeline URL
     * @throws IOException            if a page cannot be fetched
     * @throws SQLException           if the database cannot be reached or a statement fails
     * @throws ClassNotFoundException if the MySQL JDBC driver is not on the classpath
     * @throws NumberFormatException  if a timeline entry's {@code id} attribute has no numeric
     *                                part after its first {@code '-'}
     */
    public static void jsoup(String id) throws IOException, SQLException, ClassNotFoundException {
        // Page 1 carries both the profile header and the newest timeline entries, so it is
        // downloaded once and reused for both steps.
        Document firstPage = fetch(USERS_URL + id + "/timeline?page=1");

        // User slug = last path segment of the nickname link.
        String href = firstPage.select("a.nickname").attr("href");
        String slug = href.substring(href.lastIndexOf("/") + 1);
        System.out.println(slug);

        Class.forName("com.mysql.jdbc.Driver");
        // try-with-resources guarantees the connection is released even when crawling fails.
        try (Connection conn = DriverManager.getConnection(JDBC_URL, DB_USER, DB_PASSWORD)) {
            saveUserInfo(conn, slug, firstPage);
            crawlTimeline(conn, id, slug, firstPage);
        }
    }

    /** Downloads and parses one page using the crawler's user agent and timeout. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(TIMEOUT_MS)
                .get();
    }

    /** Reads the five profile counters from the page and inserts or updates the user's row. */
    private static void saveUserInfo(Connection conn, String slug, Document page) throws SQLException {
        // Counters appear in page order: follow, follower, article, words, like.
        // Any counter missing from the page stays an empty string.
        String[] stats = {"", "", "", "", ""};
        Elements items = page.select("div.info li");
        for (int i = 0; i < stats.length && i < items.size(); i++) {
            stats[i] = items.get(i).select("p").text();
        }

        // Check whether the user already exists.
        String usersql = "select * from user_information where id=?";
        System.out.println(usersql);
        boolean exists;
        try (PreparedStatement query = conn.prepareStatement(usersql)) {
            query.setString(1, slug);
            try (ResultSet rs = query.executeQuery()) {
                exists = rs.next();
            }
        }

        if (exists) {
            String userupdate = "UPDATE user_information SET follow=?,follower=?,article=?,words=?,`like`=? WHERE id =?";
            System.out.println(userupdate);
            try (PreparedStatement update = conn.prepareStatement(userupdate)) {
                for (int i = 0; i < stats.length; i++) {
                    update.setString(i + 1, stats[i]);
                }
                // The WHERE id=? placeholder must be bound as well; leaving it unset makes
                // executeUpdate() fail for every user that already exists.
                update.setString(stats.length + 1, slug);
                update.executeUpdate();
            }
        } else {
            String userinsert = "insert into user_information(id,follow,follower,article,words,`like`) values (?,?,?,?,?,?)";
            System.out.println(userinsert);
            try (PreparedStatement insert = conn.prepareStatement(userinsert)) {
                insert.setString(1, slug);
                for (int i = 0; i < stats.length; i++) {
                    insert.setString(i + 2, stats[i]);
                }
                insert.executeUpdate();
            }
        }
    }

    /**
     * Walks the timeline page by page and stores each entry, stopping when a page has no
     * entries or when an entry is found that is already stored.
     */
    private static void crawlTimeline(Connection conn, String id, String slug, Document firstPage)
            throws IOException, SQLException {
        String exit = "select * from user_dymatic_info where time=? and slug=? and dymatic_type=?";
        String sql = "insert into user_dymatic_info (content,time,slug,dymatic_type,extra_content) values (?,?,?,?,?)";

        // Both statements are prepared once and reused for every entry.
        try (PreparedStatement lookup = conn.prepareStatement(exit);
             PreparedStatement insert = conn.prepareStatement(sql)) {
            // Paging cursor: (numeric id of the last entry seen) - 1.
            String maxId = "";
            for (int page = 1; ; page++) {
                Document document = (page == 1)
                        ? firstPage
                        : fetch(USERS_URL + id + "/timeline?max_id=" + maxId + "&page=" + page);

                Elements entries = document.select("div#list-container li");
                if (entries.isEmpty()) {
                    return;
                }

                for (Element entry : entries) {
                    // The id attribute looks like "<prefix>-<number>"; long avoids overflow
                    // for ids above Integer.MAX_VALUE.
                    String index = entry.attr("id");
                    maxId = String.valueOf(Long.parseLong(index.substring(index.indexOf("-") + 1)) - 1);

                    // Drop the UTC offset ("+hh:mm") when present; keep the raw value otherwise
                    // instead of failing on substring(0, -1).
                    String rawTime = entry.select("span").attr("data-datetime");
                    int plus = rawTime.indexOf("+");
                    String time = (plus >= 0) ? rawTime.substring(0, plus) : rawTime;

                    String dymaticType = entry.select("div.info span").attr("data-type");

                    // Only comment entries carry a text and a reference to the commented article.
                    String content = "";
                    String extraContent = "";
                    if ("comment_note".equals(dymaticType)) {
                        content = entry.select("p.comment").text();
                        String articleHref = entry.select("a.title").attr("href");
                        extraContent = articleHref.substring(articleHref.lastIndexOf("/") + 1);
                    }

                    // Resume point: everything from here on was stored by an earlier run.
                    System.out.println(exit);
                    lookup.setString(1, time);
                    lookup.setString(2, slug);
                    lookup.setString(3, dymaticType);
                    try (ResultSet found = lookup.executeQuery()) {
                        if (found.next()) {
                            return;
                        }
                    }

                    insert.setString(1, content);
                    insert.setString(2, time);
                    insert.setString(3, slug);
                    insert.setString(4, dymaticType);
                    insert.setString(5, extraContent);
                    insert.executeUpdate();
                }

                System.out.println("-----------------------------");
                System.out.println(maxId);
            }
        }
    }

    /** Runs the crawler for a sample user id. */
    public static void main(String[] args) throws SQLException, IOException, ClassNotFoundException {
        jsoup("d99a7dfae9e4");
    }
}