1.創建一個maven工程
pom文件如下設置
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the crawler project: declares the MySQL JDBC driver, MyBatis, the tk.mybatis mapper and jsoup. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.hua.cn</groupId>
<artifactId>crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- MySQL JDBC driver -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.37</version>
</dependency>
<!-- MyBatis persistence framework -->
<dependency>
<groupId>org.mybatis</groupId>
<artifactId>mybatis</artifactId>
<version>3.4.2</version>
</dependency>
<!-- The mapper library wraps single-table create/read/update/delete operations, so they do not have to be hand-written -->
<!-- https://mvnrepository.com/artifact/tk.mybatis/mapper -->
<dependency>
<groupId>tk.mybatis</groupId>
<artifactId>mapper</artifactId>
<version>4.1.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup - crawler (HTML fetching and parsing) library -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
</dependencies>
</project>
2.創建BaiDuNews實體類
package com.hua.po;
import javax.persistence.Column;
import javax.persistence.Table;
/**
 * Created by hua on 2019/3/31.
 *
 * <p>Persistent object (PO) for one Baidu real-time hot-topic entry, mapped to the
 * {@code baidu_news} table.
 *
 * <p>NOTE(review): no field carries {@code @Id}. The tk.mybatis documentation says the
 * {@code ...ByPrimaryKey} mapper methods then treat every field as part of a composite key;
 * consider annotating {@link #getId() id} if those methods are ever used. TODO confirm.
 */
@Table(name = "baidu_news")
public class BaiDuNews {

    /** Row identifier. */
    private Integer id;

    public Integer getId() {
        return this.id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    /** Keyword text of the entry. */
    private String keyword;

    public String getKeyword() {
        return this.keyword;
    }

    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    /** Category (ranking list) the entry was collected under. */
    private String type;

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }

    /** Value stored in the {@code clazz} column. */
    private String clazz;

    public String getClazz() {
        return this.clazz;
    }

    public void setClazz(String clazz) {
        this.clazz = clazz;
    }

    /** Search index of the entry; stored in the {@code search_num} column. */
    @Column(name = "search_num")
    private Integer searchNum;

    public Integer getSearchNum() {
        return this.searchNum;
    }

    public void setSearchNum(Integer searchNum) {
        this.searchNum = searchNum;
    }
}
3.創建與之對應的數據庫
-- Table holding the crawled Baidu hot-topic entries.
-- `id` is AUTO_INCREMENT because the crawler inserts rows without setting an id;
-- without it the NOT NULL primary key rejects those inserts.
CREATE TABLE `baidu_news` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主鍵自增',
  `keyword` varchar(255) DEFAULT NULL COMMENT '關鍵字',
  `type` varchar(255) DEFAULT NULL COMMENT '類型',
  `clazz` varchar(255) DEFAULT NULL COMMENT '新聞相關性',
  `search_num` int(11) DEFAULT NULL COMMENT '搜索指數',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
4.創建mybatis配置文件
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<!-- MyBatis runtime configuration: global settings, type aliases, the "local" database environment and mapper scanning. -->
<configuration>
    <settings>
        <!-- Underscore-to-camel-case auto mapping is NOT enabled here; uncomment the line below and set it to true to turn it on. -->
        <!--<setting name="mapUnderscoreToCamelCase" value="false"/>-->
        <!-- Statement timeout, in seconds -->
        <setting name="defaultStatementTimeout" value="60"/>
    </settings>
    <typeAliases>
        <!-- Must match the package that contains the entity classes (com.hua.po.BaiDuNews) -->
        <package name="com.hua.po"/>
    </typeAliases>
    <!-- Environments: database connection details -->
    <environments default="local">
        <environment id="local">
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <property name="driver" value="com.mysql.jdbc.Driver"/>
                <!-- '&' must be written as '&amp;' inside an XML attribute value, otherwise the file does not parse -->
                <property name="url" value="jdbc:mysql://127.0.0.1:3306/hua_crawler?useSSL=false&amp;serverTimezone=GMT%2B8"/>
                <property name="username" value="root"/>
                <property name="password" value="root"/>
            </dataSource>
        </environment>
    </environments>
    <mappers>
        <!-- Registers every mapper interface found in this package -->
        <package name="com.hua.mapper"/>
    </mappers>
</configuration>
5.創建獲取SqlSession的util類
package com.hua.util;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import tk.mybatis.mapper.common.Mapper;
import tk.mybatis.mapper.common.MySqlMapper;
import tk.mybatis.mapper.entity.Config;
import tk.mybatis.mapper.mapperhelper.MapperHelper;
/**
 * Created by hua on 2019/3/31.
 *
 * <p>Builds the MyBatis {@link SqlSessionFactory} once, when the class is loaded, and hands out
 * sessions from it. The factory is read from {@code mybatis-config.xml} on the classpath using
 * the {@code local} environment, and the tk.mybatis generic mappers ({@link Mapper},
 * {@link MySqlMapper}) are registered on its configuration.
 */
public class MybatisHelper {

    /** Factory for the {@code local} environment; assigned exactly once by the static initializer. */
    private static final SqlSessionFactory sqlSessionFactoryLocal;

    static {
        SqlSessionFactory factory;
        try {
            factory = new SqlSessionFactoryBuilder()
                    .build(Resources.getResourceAsReader("mybatis-config.xml"), "local");

            Config config = new Config();
            config.setEnableMethodAnnotation(true);
            config.setNotEmpty(true);

            MapperHelper mapperHelper = new MapperHelper();
            mapperHelper.setConfig(config);
            mapperHelper.registerMapper(Mapper.class);
            mapperHelper.registerMapper(MySqlMapper.class);
            // The configuration is available straight from the factory, so no session
            // has to be opened (and closed) just to reach it.
            mapperHelper.processConfiguration(factory.getConfiguration());
        } catch (Exception e) {
            // Fail fast: swallowing the error here would leave the factory unset (or the
            // generic mappers unregistered) and surface later as an unrelated failure.
            throw new ExceptionInInitializerError(e);
        }
        sqlSessionFactoryLocal = factory;
    }

    /**
     * Opens a new session on the {@code local} environment.
     *
     * @return a freshly opened {@link SqlSession}; the caller is responsible for closing it
     */
    public static SqlSession getSqlSessionLocal() {
        return sqlSessionFactoryLocal.openSession();
    }
}
6.創建接口BaiduNewsMapper並繼承Mapper(裏面封裝了增刪改查)
package com.hua.mapper;
import com.hua.po.BaiDuNews;
import tk.mybatis.mapper.common.Mapper;
/**
* Created by hua on 2019/3/31.
*
* Mapper interface for {@link BaiDuNews}. It declares no methods of its own; all
* operations come from the tk.mybatis {@link Mapper} interface it extends.
*/
public interface BaiduNewsMapper extends Mapper<BaiDuNews> {
}
7.創建主功能類Day01_BaiduNewsCrawler
package com.hua.main;
import com.hua.mapper.BaiduNewsMapper;
import com.hua.po.BaiDuNews;
import com.hua.util.MybatisHelper;
import org.apache.ibatis.session.SqlSession;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Created by hua on 2019/3/31.
 *
 * <p>Crawls the Baidu hot-topic ranking pages and stores every ranked entry in the database
 * through {@link BaiduNewsMapper}.
 */
public class Day01_BaiduNewsCrawler {

    /**
     * Fetches the real-time ranking page, stores its entries, then follows the category links
     * found under {@code #flist} and stores the entries of each category page. All inserts are
     * committed together at the end.
     *
     * @param args unused
     * @throws IOException if any page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        SqlSession sqlSession = MybatisHelper.getSqlSessionLocal();
        try {
            // Mapper for the table being written; provides the single-table operations.
            // (Call baiduNewsMapper.selectAll() here to inspect rows that are already stored.)
            BaiduNewsMapper baiduNewsMapper = sqlSession.getMapper(BaiduNewsMapper.class);

            // Entry page: the real-time hot-topic ranking.
            String url = "http://top.baidu.com/buzz?b=1";
            Document doc = Jsoup.connect(url).get();
            getElementAndInsert(doc, baiduNewsMapper, "實時熱點");

            // Category list of the entry page.
            Elements lis = doc.select("#flist li");
            // NOTE(review): the first two entries are skipped - presumably they are not
            // additional category pages; confirm against the page markup.
            for (int i = 2; i < lis.size(); i++) {
                Elements link = lis.get(i).select("a");
                // The link title is used as the entry type.
                String title = link.attr("title");
                String href = link.attr("href");
                if (href.isEmpty()) {
                    // No usable link in this item; substring(1) below would throw on "".
                    continue;
                }
                // Drop the first character of the href before appending it to the host.
                Document categoryDoc = Jsoup.connect("http://top.baidu.com" + href.substring(1)).get();
                getElementAndInsert(categoryDoc, baiduNewsMapper, title);
            }

            // Commit the transaction.
            sqlSession.commit();
        } finally {
            // Always release the session, including when a fetch or insert fails part-way.
            sqlSession.close();
        }
    }

    /**
     * Extracts the ranking rows from a page and inserts one record per row whose search
     * index is numeric.
     *
     * @param doc             parsed ranking page
     * @param baiduNewsMapper mapper used to insert each record
     * @param type            value stored in the record's {@code type} property
     */
    public static void getElementAndInsert(Document doc, BaiduNewsMapper baiduNewsMapper, String type) {
        // Rows of the ranking table.
        Elements trs = doc.select("#main > div.mainBody > div > table tr");
        for (Element tr : trs) {
            String keyword = tr.select(".list-title").text();
            String clazz = tr.select(".tc").text();
            // The search-index cell decides whether the row is usable data.
            String tempNum = tr.select(".last").text();
            int num;
            try {
                num = Integer.parseInt(tempNum);
            } catch (NumberFormatException e) {
                System.out.println("Debug");
                // Rows whose last cell is not a number are skipped entirely.
                continue;
            }

            // Populate the persistent object.
            BaiDuNews baiduNews = new BaiDuNews();
            baiduNews.setKeyword(keyword);
            baiduNews.setClazz(clazz);
            baiduNews.setSearchNum(num);
            baiduNews.setType(type);

            // Store the record.
            baiduNewsMapper.insert(baiduNews);
            System.out.println("入庫: " + keyword);
        }
    }
}
8.最後的項目結構是
9.運行程序,測試結果