1. 繼續在昨天的工程上學習，所以不用再重寫 pom 配置文件、MyBatis 配置和 util 工具類。
首先創建數據庫表 `v2ex` 和對應的實體類 V2ex：
-- One row per topic scraped from a V2EX listing page.
-- No AUTO_INCREMENT start value is given, so ids of a freshly created table begin at 1.
-- NOTE: MySQL's `utf8` is the 3-byte character set, so 4-byte characters (e.g. emoji)
-- cannot be stored; the crawler strips such characters from titles before inserting.
CREATE TABLE `v2ex` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主鍵',
  `title` varchar(255) DEFAULT NULL COMMENT '標題',
  `url` varchar(255) DEFAULT NULL COMMENT '地址',
  `user` varchar(255) DEFAULT NULL COMMENT '發帖的用戶',
  `type` varchar(255) DEFAULT NULL COMMENT '大分類',
  `clazz` varchar(255) DEFAULT NULL COMMENT '小分類',
  `up_time` varchar(255) DEFAULT NULL COMMENT '發帖相對時間',
  `reply_num` int(11) DEFAULT NULL COMMENT '回覆數量',
  `crawler_time` datetime DEFAULT NULL COMMENT '抓取時間',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
package com.hua.po;
import javax.persistence.Column;
import javax.persistence.Table;
import java.util.Date;
/**
 * Created by hua on 2019/3/31.
 *
 * <p>Persistent object mapped to the {@code v2ex} table; each instance holds the data
 * of one scraped topic. Fields whose column name differs from the Java name carry an
 * explicit {@code @Column} mapping.
 */
@Table(name = "v2ex")
public class V2ex {

    /** Primary key of the row. */
    private Integer id;

    /** Topic title. */
    private String title;

    /** Address of the topic. */
    private String url;

    /** User who posted the topic. */
    private String user;

    /** Top-level category. */
    private String type;

    /** Sub-category. */
    private String clazz;

    /** Relative posting time, kept as text. */
    @Column(name = "up_time")
    private String upTime;

    /** Moment at which the topic was crawled. */
    @Column(name = "crawler_time")
    private Date crawlerTime;

    /** Number of replies; may be {@code null}. */
    @Column(name = "reply_num")
    private Integer replyNum;

    // ---- id ----

    public Integer getId() {
        return this.id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    // ---- title ----

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    // ---- url ----

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    // ---- user ----

    public String getUser() {
        return this.user;
    }

    public void setUser(String user) {
        this.user = user;
    }

    // ---- type ----

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }

    // ---- clazz ----

    public String getClazz() {
        return this.clazz;
    }

    public void setClazz(String clazz) {
        this.clazz = clazz;
    }

    // ---- upTime ----

    public String getUpTime() {
        return this.upTime;
    }

    public void setUpTime(String upTime) {
        this.upTime = upTime;
    }

    // ---- crawlerTime ----

    public Date getCrawlerTime() {
        return this.crawlerTime;
    }

    public void setCrawlerTime(Date crawlerTime) {
        this.crawlerTime = crawlerTime;
    }

    // ---- replyNum ----

    public Integer getReplyNum() {
        return this.replyNum;
    }

    public void setReplyNum(Integer replyNum) {
        this.replyNum = replyNum;
    }
}
2. 創建包含 main 方法的主類 Day02_V2exCrawler
package com.hua.main;
import com.hua.mapper.V2exMapper;
import com.hua.po.V2ex;
import com.hua.util.MybatisHelper;
import org.apache.ibatis.session.SqlSession;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.Date;
/**
 * Created by hua on 2019/3/31.
 *
 * <p>Crawls the V2EX front page tab by tab: for every tab link found under
 * {@code #Tabs} it downloads the tab page, turns each {@code div.cell.item} into a
 * {@link V2ex} object and inserts it through {@link V2exMapper}. All inserts are
 * committed once at the end of the run.
 */
public class Day02_V2exCrawler {

    /** Root URL of the target site; tab and topic hrefs are appended to it. */
    private static final String BASE_URL = "https://www.v2ex.com";

    /**
     * Runs the crawl.
     *
     * @param args unused
     * @throws IOException if fetching the front page or any tab page fails; in that
     *                     case nothing is committed and the session is still closed
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources guarantees the session is closed even when a request
        // or an insert throws half-way through the crawl.
        try (SqlSession sqlSession = MybatisHelper.getSqlSessionLocal()) {
            V2exMapper v2exMapper = sqlSession.getMapper(V2exMapper.class);

            Document doc = Jsoup.connect(BASE_URL).get();
            Elements as = doc.select("#Tabs a");
            for (Element a : as) {
                // Compare the raw (relative) href: once the site URL has been
                // prepended the value can never equal "/?tab=...".
                String tabPath = a.attr("href");
                if (isAggregateTab(tabPath)) {
                    continue;
                }
                // Name of the top-level category.
                String type = a.text();

                // Fetch every topic listed under this category.
                Document typeDoc = Jsoup.connect(BASE_URL + tabPath).get();
                for (Element div : typeDoc.select("div.cell.item")) {
                    v2exMapper.insert(parseTopic(div, type));
                }
            }
            // Commit the transaction; closing is handled by try-with-resources.
            sqlSession.commit();
        }
    }

    /**
     * Tells whether a tab only aggregates topics of the other tabs. Those tabs are
     * skipped because crawling the other categories already covers their content.
     */
    private static boolean isAggregateTab(String tabPath) {
        return "/?tab=hot".equals(tabPath) || "/?tab=all".equals(tabPath);
    }

    /**
     * Builds the {@link V2ex} record for one topic row of a tab page.
     *
     * @param div  the {@code div.cell.item} element of the topic
     * @param type name of the top-level category the topic was listed under
     * @return a populated record; fields that cannot be extracted stay {@code null}
     */
    private static V2ex parseTopic(Element div, String type) {
        V2ex v2ex = new V2ex();

        // URL and title come from the same link element.
        Elements titleLink = div.select("span.item_title a");
        v2ex.setUrl(BASE_URL + titleLink.attr("href"));
        // Strip characters outside the Basic Multilingual Plane (e.g. emoji).
        String title = titleLink.text().replaceAll("[\\x{10000}-\\x{10FFFF}]", "");
        System.out.println(title);
        v2ex.setTitle(title);

        v2ex.setType(type);

        // Removes the first digit 1-9 found in the info text before splitting.
        // NOTE(review): presumably meant to drop a leading counter; it would also
        // alter a node or user name if that is where the first digit occurs.
        String[] temp = div.select("span.topic_info").text().trim()
                .replaceFirst("[1-9]", "").trim().split(" ");

        // Sub-category; split always yields at least one element.
        v2ex.setClazz(temp[0]);
        // User name.
        if (temp.length > 2) {
            v2ex.setUser(temp[2]);
        }
        // Relative posting time: "<n> <unit>" or, for hours, "<n> <unit> <n> <unit>".
        // NOTE(review): confirm the unit literal matches the script the live site
        // actually renders.
        if (temp.length > 5) {
            if ("小時".equals(temp[5])) {
                if (temp.length > 7) {
                    v2ex.setUpTime(temp[4] + temp[5] + temp[6] + temp[7]);
                }
            } else {
                v2ex.setUpTime(temp[4] + temp[5]);
            }
        }

        // Time of the crawl.
        v2ex.setCrawlerTime(new Date());
        // Reply count.
        v2ex.setReplyNum(parseReplyCount(div.select("a.count_livid").text()));
        return v2ex;
    }

    /**
     * Parses the reply counter text.
     *
     * @param text text of the counter element; empty when the element is absent
     * @return the count, or {@code null} when the text is empty or not an integer
     */
    private static Integer parseReplyCount(String text) {
        if (text.isEmpty()) {
            return null;
        }
        try {
            return Integer.parseInt(text);
        } catch (NumberFormatException ignored) {
            // Best effort: an unparsable counter is stored as NULL.
            return null;
        }
    }
}