學習Java爬蟲Day01-抓取百度實時熱點

1.創建一個maven工程

pom文件如下設置

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the crawler tutorial project: declares the project
     coordinates and the four libraries the example code imports. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hua.cn</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- MySQL JDBC driver (the MyBatis configuration refers to com.mysql.jdbc.Driver) -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.37</version>
        </dependency>
        <!-- MyBatis core: SqlSessionFactory, SqlSession and XML configuration loading -->
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>3.4.2</version>
        </dependency>
        <!-- tk.mybatis mapper wraps single-table CRUD, so those statements need not be hand-written -->
        <!-- https://mvnrepository.com/artifact/tk.mybatis/mapper -->
        <dependency>
            <groupId>tk.mybatis</groupId>
            <artifactId>mapper</artifactId>
            <version>4.1.5</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup : library used by the crawler to fetch and query HTML -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

    </dependencies>
</project>

2.創建BaiDuNews實體類

package com.hua.po;

import javax.persistence.Column;
import javax.persistence.Table;

/**
 * Created by hua on 2019/3/31.
 *
 * <p>Persistence object for one Baidu real-time hot-topic entry, mapped to the
 * {@code baidu_news} table. Only {@code searchNum} needs an explicit column
 * name ({@code search_num}); no other field carries a column annotation.
 *
 * <p>NOTE(review): no field is annotated as the primary key here; confirm
 * whether that matters before using the mapper's by-primary-key methods.
 */
@Table(name = "baidu_news")
public class BaiDuNews {

    /** Row identifier. */
    private Integer id;

    /** The hot-search keyword. */
    private String keyword;

    /** Category label the keyword was listed under. */
    private String type;

    /** News-relevance text taken from the listing. */
    private String clazz;

    /** Search index reported for the keyword. */
    @Column(name = "search_num")
    private Integer searchNum;

    // ---- getters ----

    /** @return the row identifier, or {@code null} if not set */
    public Integer getId() {
        return this.id;
    }

    /** @return the hot-search keyword */
    public String getKeyword() {
        return this.keyword;
    }

    /** @return the category label */
    public String getType() {
        return this.type;
    }

    /** @return the news-relevance text */
    public String getClazz() {
        return this.clazz;
    }

    /** @return the search index */
    public Integer getSearchNum() {
        return this.searchNum;
    }

    // ---- setters ----

    /** @param id the row identifier */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @param keyword the hot-search keyword */
    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    /** @param type the category label */
    public void setType(String type) {
        this.type = type;
    }

    /** @param clazz the news-relevance text */
    public void setClazz(String clazz) {
        this.clazz = clazz;
    }

    /** @param searchNum the search index */
    public void setSearchNum(Integer searchNum) {
        this.searchNum = searchNum;
    }
}

3.創建與之對應的數據庫表

-- Table that stores the crawled Baidu hot-search entries, one row per keyword.
-- `id` is declared AUTO_INCREMENT so MySQL generates it: the crawler never sets
-- an id before inserting, and a NULL id in a NOT NULL column without
-- AUTO_INCREMENT is rejected by the server.
CREATE TABLE `baidu_news` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主鍵自增',
  `keyword` varchar(255) DEFAULT NULL COMMENT '關鍵字',
  `type` varchar(255) DEFAULT NULL COMMENT '類型',
  `clazz` varchar(255) DEFAULT NULL COMMENT '新聞相關性',
  `search_num` int(11) DEFAULT NULL COMMENT '搜索指數',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

4.創建mybatis配置文件

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<!-- MyBatis configuration for the crawler: one environment ("local") pointing at a
     MySQL database on this machine, plus package scanning for aliases and mappers. -->
<configuration>
    <!-- Global settings. The camel-case auto-mapping override is intentionally left
         commented out; only the default statement timeout (60) is set. -->
    <settings>
        <!--<setting name="mapUnderscoreToCamelCase" value="false"/>-->
        <setting name="defaultStatementTimeout" value="60"/>
    </settings>

    <!-- Register short aliases for the entity classes. The package must be the one
         that actually contains the entities (BaiDuNews lives in com.hua.po). -->
    <typeAliases>
        <package name="com.hua.po"/>
    </typeAliases>

    <!-- Environments and their database connection details. "local" is the default
         and is also the id MybatisHelper asks for explicitly. -->
    <environments default="local">
        <environment id="local">
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <property name="driver" value="com.mysql.jdbc.Driver"/>
                <property name="url" value="jdbc:mysql://127.0.0.1:3306/hua_crawler?useSSL=false&amp;serverTimezone=GMT%2B8"/>
                <!-- Credentials for a local development database only. -->
                <property name="username" value="root"/>
                <property name="password" value="root"/>
            </dataSource>
        </environment>
    </environments>

    <!-- Scan this package for mapper interfaces (e.g. BaiduNewsMapper). -->
    <mappers>
        <package name="com.hua.mapper"/>
    </mappers>
</configuration>

5.創建獲取SqlSession的util類

package com.hua.util;

import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import tk.mybatis.mapper.common.Mapper;
import tk.mybatis.mapper.common.MySqlMapper;
import tk.mybatis.mapper.entity.Config;
import tk.mybatis.mapper.mapperhelper.MapperHelper;


/**
 * Created by hua on 2019/3/31.
 *
 * <p>Holds the single {@link SqlSessionFactory} of the application. The factory is
 * built once, when this class is initialised, from {@code mybatis-config.xml}
 * using the {@code local} environment, and is then post-processed by the
 * tk.mybatis {@link MapperHelper} so that interfaces extending {@link Mapper}
 * or {@link MySqlMapper} are handled by it.
 *
 * <p>Initialisation fails fast: if the configuration cannot be loaded or the
 * mapper registration fails, class initialisation aborts with the original
 * exception as the cause, rather than leaving a {@code null} factory behind
 * that would only surface later as a {@code NullPointerException}.
 */
public class MybatisHelper {
    /** Classpath resource holding the MyBatis configuration. */
    private static final String CONFIG_RESOURCE = "mybatis-config.xml";

    /** Id of the {@code <environment>} to use from the configuration. */
    private static final String ENVIRONMENT_ID = "local";

    private static final SqlSessionFactory sqlSessionFactoryLocal = buildSqlSessionFactory();

    /**
     * Loads the configuration, builds the factory and registers the generic
     * tk.mybatis mappers on the factory's configuration.
     *
     * @return the fully configured factory, never {@code null}
     * @throws IllegalStateException if the configuration resource cannot be read
     */
    private static SqlSessionFactory buildSqlSessionFactory() {
        final SqlSessionFactory factory;
        try {
            factory = new SqlSessionFactoryBuilder()
                    .build(Resources.getResourceAsReader(CONFIG_RESOURCE), ENVIRONMENT_ID);
        } catch (java.io.IOException e) {
            throw new IllegalStateException("Unable to read MyBatis configuration: " + CONFIG_RESOURCE, e);
        }

        // tk.mybatis options, unchanged from the original setup.
        Config config = new Config();
        config.setEnableMethodAnnotation(true);
        config.setNotEmpty(true);

        MapperHelper mapperHelper = new MapperHelper();
        mapperHelper.setConfig(config);
        mapperHelper.registerMapper(Mapper.class);
        mapperHelper.registerMapper(MySqlMapper.class);
        // The factory exposes its Configuration directly, so no SqlSession has to be
        // opened (and closed again) just to reach it.
        mapperHelper.processConfiguration(factory.getConfiguration());
        return factory;
    }

    /**
     * Opens a new session on the {@code local} environment.
     *
     * @return a new {@link SqlSession}; the caller is responsible for committing
     *         its work and closing the session
     */
    public static SqlSession getSqlSessionLocal() {
        return sqlSessionFactoryLocal.openSession();
    }
}

6.創建接口BaiduNewsMapper並繼承Mapper(裏面封裝了增刪改查)

package com.hua.mapper;
import com.hua.po.BaiDuNews;
import tk.mybatis.mapper.common.Mapper;

/**
 * Created by hua on 2019/3/31.
 *
 * <p>MyBatis mapper for {@link BaiDuNews}. It declares no methods of its own;
 * every operation available on it is inherited from the tk.mybatis generic
 * {@link Mapper} interface it extends.
 */
public interface BaiduNewsMapper extends Mapper<BaiDuNews> {
}

7.創建主功能類Day01_BaiduNewsCrawler

package com.hua.main;

import com.hua.mapper.BaiduNewsMapper;
import com.hua.po.BaiDuNews;
import com.hua.util.MybatisHelper;
import org.apache.ibatis.session.SqlSession;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
 * Created by hua on 2019/3/31.
 *
 * <p>Crawls the Baidu hot-search boards and stores every ranked keyword through
 * {@link BaiduNewsMapper}. The entry page is stored under the type
 * {@code "實時熱點"}; each further board linked from the page's {@code #flist}
 * list is fetched and stored under the link's {@code title} attribute.
 */
public class Day01_BaiduNewsCrawler {
    /** Scheme and host prepended to the relative board links found on the page. */
    private static final String SITE_ROOT = "http://top.baidu.com";

    /** Entry page of the crawl. */
    private static final String START_URL = SITE_ROOT + "/buzz?b=1";

    /**
     * Index of the first {@code #flist li} entry that is followed.
     * NOTE(review): the first two entries are skipped — presumably they are not
     * separate boards (e.g. a heading and the entry page itself); verify against
     * the live page.
     */
    private static final int FIRST_CATEGORY_INDEX = 2;

    /**
     * Runs the crawl inside a single session and commits once at the end.
     *
     * @param args unused
     * @throws IOException if any page cannot be fetched; in that case nothing is
     *                     committed and the session is still closed
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources guarantees the session is closed even when a fetch or
        // an insert throws part-way through the crawl.
        try (SqlSession sqlSession = MybatisHelper.getSqlSessionLocal()) {
            // Mapper for the target table, used for all inserts below.
            BaiduNewsMapper baiduNewsMapper = sqlSession.getMapper(BaiduNewsMapper.class);

            // The entry page is itself a board: store its rows first.
            Document doc = Jsoup.connect(START_URL).get();
            getElementAndInsert(doc, baiduNewsMapper, "實時熱點");

            // Follow the remaining boards listed on the entry page.
            Elements lis = doc.select("#flist li");
            for (int i = FIRST_CATEGORY_INDEX; i < lis.size(); i++) {
                Elements link = lis.get(i).select("a");
                // The link title doubles as the type stored with each row.
                String title = link.attr("title");
                String href = link.attr("href");
                if (href.isEmpty()) {
                    // List item without a usable link: substring(1) below would throw.
                    continue;
                }
                // Drop the first character of the relative href before appending it to
                // the site root (assumes hrefs of the form "./buzz?b=N" — TODO confirm).
                Document categoryDoc = Jsoup.connect(SITE_ROOT + href.substring(1)).get();
                getElementAndInsert(categoryDoc, baiduNewsMapper, title);
            }

            // Commit everything collected during this run.
            sqlSession.commit();
        }
    }

    /**
     * Extracts the ranking rows from one board page and inserts each of them.
     *
     * <p>For every {@code tr} under {@code #main > div.mainBody > div > table} the
     * keyword is read from {@code .list-title}, the relevance text from
     * {@code .tc} and the search index from {@code .last}. Rows whose search
     * index is not a plain integer are skipped after printing {@code "Debug"}.
     *
     * @param doc             parsed board page
     * @param baiduNewsMapper mapper used to insert the rows
     * @param type            type label stored with every row from this page
     */
    public static void getElementAndInsert(Document doc, BaiduNewsMapper baiduNewsMapper, String type) {
        Elements trs = doc.select("#main > div.mainBody > div > table tr");
        for (Element tr : trs) {
            String keyword = tr.select(".list-title").text();
            String clazz = tr.select(".tc").text();
            // The search-index cell decides whether the row is usable data.
            String tempNum = tr.select(".last").text();
            int num;
            try {
                num = Integer.parseInt(tempNum);
            } catch (NumberFormatException e) {
                System.out.println("Debug");
                // Not a number: skip the row entirely.
                continue;
            }

            // Copy the extracted values into the persistence object.
            BaiDuNews baiduNews = new BaiDuNews();
            baiduNews.setKeyword(keyword);
            baiduNews.setClazz(clazz);
            baiduNews.setSearchNum(num);
            baiduNews.setType(type);

            baiduNewsMapper.insert(baiduNews);
            System.out.println("入庫: " + keyword);
        }
    }
}

8.最後的項目結構是

9.運行程序,測試結果

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章