學習Java爬蟲Day02-抓取v2ex標題

1.繼續在昨天的工程上學習,所以不需要重新編寫 pom 配置文件、MyBatis 配置和 util 工具類

首先創建數據庫表和實體類V2ex

-- One row per crawled v2ex topic; populated by Day02_V2exCrawler.
-- NOTE(review): charset is utf8 (3-byte), so supplementary-plane characters
-- (emoji) must be stripped from titles before insert — the crawler does this.
CREATE TABLE `v2ex` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主鍵',
  `title` varchar(255) DEFAULT NULL COMMENT '標題',
  `url` varchar(255) DEFAULT NULL COMMENT '地址',
  `user` varchar(255) DEFAULT NULL COMMENT '發帖的用戶',
  `type` varchar(255) DEFAULT NULL COMMENT '大分類',
  `clazz` varchar(255) DEFAULT NULL COMMENT '小分類',
  `up_time` varchar(255) DEFAULT NULL COMMENT '發帖相對時間',
  `reply_num` int(11) DEFAULT NULL COMMENT '回覆數量',
  `crawler_time` datetime DEFAULT NULL COMMENT '抓取時間',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1945 DEFAULT CHARSET=utf8;
package com.hua.po;

import javax.persistence.Column;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
import javax.persistence.Table;

import java.util.Date;

/**
 * Created by hua on 2019/3/31.
 */

/**
 * Entity mapped to the {@code v2ex} table: one crawled topic with its title,
 * url, author, category, relative post time, reply count and crawl timestamp.
 *
 * <p>Fields whose column name differs from the Java name carry an explicit
 * {@link Column} mapping; the rest map by identical name.
 *
 * Created by hua on 2019/3/31.
 */
@Table(name = "v2ex")
public class V2ex {
    // FIX: the table declares id as AUTO_INCREMENT PRIMARY KEY, but the entity
    // did not mark it. Without @Id a common-mapper (tk.mybatis style) treats
    // every column as part of the key; IDENTITY lets MySQL generate the value.
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Integer id;
    private String title;
    private String url;
    private String user;
    private String type;
    private String clazz;
    @Column(name = "up_time")
    private String upTime;      // relative post time as shown on the page, e.g. "3小時前"
    @Column(name = "crawler_time")
    private Date crawlerTime;   // timestamp taken when the row was crawled
    @Column(name = "reply_num")
    private Integer replyNum;   // null when the topic cell shows no reply badge

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getUser() {
        return user;
    }

    public void setUser(String user) {
        this.user = user;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getClazz() {
        return clazz;
    }

    public void setClazz(String clazz) {
        this.clazz = clazz;
    }

    public String getUpTime() {
        return upTime;
    }

    public void setUpTime(String upTime) {
        this.upTime = upTime;
    }

    public Date getCrawlerTime() {
        return crawlerTime;
    }

    public void setCrawlerTime(Date crawlerTime) {
        this.crawlerTime = crawlerTime;
    }

    public Integer getReplyNum() {
        return replyNum;
    }

    public void setReplyNum(Integer replyNum) {
        this.replyNum = replyNum;
    }
}

2.創建主方法Day02_V2exCrawler

package com.hua.main;

import com.hua.mapper.V2exMapper;
import com.hua.po.V2ex;
import com.hua.util.MybatisHelper;
import org.apache.ibatis.session.SqlSession;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.Date;

/**
 * Created by hua on 2019/3/31.
 */
/**
 * Day02: crawls every topic listed under the front-page tabs of v2ex.com and
 * persists one {@link V2ex} row per topic through the MyBatis mapper.
 *
 * Created by hua on 2019/3/31.
 */
public class Day02_V2exCrawler {

    /** Base address of the target site; tab/topic hrefs on the page are relative to it. */
    private static final String BASE_URL = "https://www.v2ex.com";

    public static void main(String[] args) throws IOException {
        // Obtain the DB session; commit once at the end, close unconditionally.
        SqlSession sqlSession = MybatisHelper.getSqlSessionLocal();
        try {
            V2exMapper v2exMapper = sqlSession.getMapper(V2exMapper.class);
            // The tab bar (#Tabs) on the front page lists the big categories.
            Document doc = Jsoup.connect(BASE_URL).get();
            for (Element a : doc.select("#Tabs a")) {
                String tab = a.attr("href");
                // Skip the aggregate tabs: "hot" and "all" only repeat topics that
                // already appear under the other tabs.
                // BUG FIX: the original compared the absolute URL (base + href)
                // against these relative paths, so the filter never matched and
                // both aggregate tabs were crawled anyway.
                if (tab.equals("/?tab=hot") || tab.equals("/?tab=all")) {
                    continue;
                }
                // Big-category label shown on the tab.
                String type = a.text();
                // Fetch the tab page and persist each topic cell.
                Document typeDoc = Jsoup.connect(BASE_URL + tab).get();
                for (Element div : typeDoc.select("div.cell.item")) {
                    v2exMapper.insert(toV2ex(div, type));
                }
            }
            // Commit all inserts in one transaction.
            sqlSession.commit();
        } finally {
            // FIX: always release the session, even if a fetch or insert throws.
            sqlSession.close();
        }
    }

    /**
     * Maps one topic cell ({@code div.cell.item}) to a {@link V2ex} row.
     * Parsing of the loosely formatted topic_info text is best-effort:
     * fields that cannot be extracted are left {@code null}.
     */
    private static V2ex toV2ex(Element div, String type) {
        V2ex v2ex = new V2ex();
        v2ex.setType(type);
        v2ex.setCrawlerTime(new Date());

        // Topic link: selected once instead of twice as in the original.
        Elements titleLink = div.select("span.item_title a");
        v2ex.setUrl(BASE_URL + titleLink.attr("href"));
        // Strip supplementary-plane characters (emoji) that the utf8 column
        // cannot store.
        String title = titleLink.text().replaceAll("[\\x{10000}-\\x{10FFFF}]", "");
        System.out.println(title);
        v2ex.setTitle(title);

        // topic_info is space-separated; after dropping the first digit the
        // fields are picked by position (small class, user, relative time).
        String[] temp = div.select("span.topic_info").text().trim().replaceFirst("[1-9]", "").trim().split(" ");
        try {
            v2ex.setClazz(temp[0]);
            v2ex.setUser(temp[2]);
            String time = temp[5];
            if (time.equals("小時")) {
                v2ex.setUpTime(temp[4] + temp[5] + temp[6] + temp[7]);
            } else {
                v2ex.setUpTime(temp[4] + temp[5]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            // FIX: narrowed from a silent catch(Exception). Some cells carry a
            // shorter topic_info; keep the row but surface the skipped parse.
            System.err.println("topic_info not parsed for " + v2ex.getUrl());
        }

        // Reply count badge; absent or non-numeric text leaves replyNum null.
        String tempReplyNum = div.select("a.count_livid").text();
        try {
            v2ex.setReplyNum(Integer.parseInt(tempReplyNum));
        } catch (NumberFormatException ignored) {
            // No badge on the cell — replyNum stays null on purpose.
        }
        return v2ex;
    }
}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章