通過 Java 爬蟲(Jsoup)爬取 CSDN 和 OSC 博客

1.根據CSDN文章類型獲取對應類型的文章鏈接

/**
 * Crawls the CSDN navigation page of each configured category and saves the
 * listed articles whose read count is greater than 100.
 *
 * <p>For every category the page {@code https://www.csdn.net/nav/<category>} is
 * fetched, the element with id {@code feedlist_id} is located, and each child
 * carrying the class {@code clearfix} is treated as one article entry. Entries
 * that lack a title link or a summary node are skipped instead of aborting the
 * whole crawl; a missing or empty counter is stored as 0.
 *
 * @throws IOException if a category page cannot be fetched
 * @throws NumberFormatException if a non-empty counter text is not a plain integer
 */
public void searchCsdnUrl() throws IOException {
    // Category slugs appended to the nav URL, in the order they are crawled.
    String[] categories = {
        "web", "ai", "cloud", "db", "fund", "career", "game", "engineering",
        "mobile", "sec", "iot", "lang", "arch", "ops", "avi", "other"
    };
    for (String type : categories) {
        String url = "https://www.csdn.net/nav/" + type;
        Connection conn = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10")
                .timeout(1000)
                .method(Connection.Method.GET);
        Document doc = conn.get();

        Element articleListDiv = doc.body().getElementById("feedlist_id");
        if (articleListDiv == null) {
            // Page layout changed or the request was served a different page: nothing to parse here.
            System.out.println("CSDN feed list not found, skipping: " + url);
            continue;
        }

        Elements articleList = articleListDiv.getElementsByClass("clearfix");
        for (Element article : articleList) {
            Element linkNode = article.select("div h2 a").first();
            Element desptionNode = article.getElementsByClass("summary oneline").first();
            if (linkNode == null || desptionNode == null) {
                // Not every "clearfix" element is guaranteed to be an article entry.
                continue;
            }
            Element readNum = article.getElementsByClass("read_num").first();
            // NOTE(review): the class name below carries a trailing space in the original
            // selector; it is kept byte-for-byte — confirm it still matches the live markup.
            Element commentNum = article.getElementsByClass("common_num ").first();

            Article articleEntity = new Article();
            articleEntity.setAddress(linkNode.attr("href"));
            articleEntity.setTitle(linkNode.text());
            articleEntity.setDesption(desptionNode.text());
            // The original wrote 0 into the comment count when the read count was empty;
            // each counter is now written to its own field.
            articleEntity.setReadNum(csdnCount(readNum));
            articleEntity.setCommentNum(csdnCount(commentNum));
            articleEntity.setStatus(0);
            articleEntity.setType(type);
            articleEntity.setTime(new Date());
            articleEntity.setBlogType("CSDN");

            // Only articles read more than 100 times are persisted.
            if (articleEntity.getReadNum() > 100) {
                articleDao.save(articleEntity);
            }
            System.out.println("文章閱讀數+++++++++++:" + articleEntity.getReadNum());
        }
        System.out.println("文章總數++++++++++++:" + articleList.size());
        System.out.println("文章絕對路勁地址:http://blog.csdn.net" + "++++++++++++++++++++++++");
    }
}

/**
 * Reads the integer shown in the {@code num} child of a CSDN counter node.
 *
 * @param counter the counter node, or {@code null} when the entry has none
 * @return the parsed count, or 0 when the node is missing or its text is empty
 */
private static int csdnCount(Element counter) {
    if (counter == null) {
        return 0;
    }
    String text = counter.getElementsByClass("num").text();
    return "".equals(text) ? 0 : Integer.parseInt(text);
}

/**
 * Fetches a CSDN article page and returns its content wrapped in a minimal
 * HTML document, followed by a link back to the original article.
 *
 * @param postUrl URL of the CSDN article page
 * @return an HTML document containing the outer HTML of the element with id
 *         {@code content_views} plus a source link to {@code postUrl}
 * @throws IOException if the page cannot be fetched, or if it contains no
 *         element with id {@code content_views}
 */
public String searchCsdnUrl(String postUrl) throws IOException {
    Connection conn = Jsoup.connect(postUrl)
            .userAgent("Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10")
            .timeout(1000)
            .method(Connection.Method.GET);
    Document doc = conn.get();

    Element content = doc.body().getElementById("content_views");
    if (content == null) {
        // Previously this surfaced as a bare NullPointerException with no hint of which URL failed.
        throw new IOException("No element with id \"content_views\" found at " + postUrl);
    }

    String html = "<!DOCTYPE html>\n" +
            "<html>\n" +
            "<body>" + content.outerHtml();
    html += "<br/>作者原文鏈接:" + "<p><a href=\"" + postUrl + "\" target=\"_blank\" rel=\"noopener\">" + postUrl + "</a></p>\n" +
            "</body>\n" +
            "</html>";
    System.out.println("文章總數:" + postUrl);

    return html;
}

2.根據OSC文章類型獲取OSC文章鏈接

/**
 * Crawls the OSChina blog listing of each configured classification and saves
 * every article entry found there.
 *
 * <p>For every classification id the page
 * {@code https://www.oschina.net/blog?classification=<id>} is fetched, the
 * element with id {@code recommendArticleList} is located, and each descendant
 * with class {@code content} is treated as one article entry. Entries that lack
 * a link, a summary, or the fourth and fifth {@code item} nodes (read and
 * comment counters) are skipped instead of aborting the whole crawl.
 *
 * @throws IOException if a listing page cannot be fetched
 * @throws NumberFormatException if a non-empty counter text is not numeric
 */
public void saveOSCUrl() throws IOException {
    // { OSChina classification id, type name stored on the article } — one table
    // instead of a separate id list and a parallel if-chain that had to be kept in sync.
    String[][] categories = {
        {"5611447", "ai"},
        {"5593654", "cloud"},
        {"428639", "cloud"},
        {"5765988", "blockchain"},
        {"428602", "mobile"},
        {"428612", "web"},
        {"428640", "engineering"},
        {"429511", "game"},
        {"428609", "lang"},
        {"428610", "db"},
        {"428613", "ops"},
        {"6289115", "iot"},
        {"428647", "avi"},
        {"430381", "other"},
        {"428638", "arch"}
    };

    for (String[] category : categories) {
        String classificationId = category[0];
        String type = category[1];
        String relUrl = "https://www.oschina.net/blog?classification=" + classificationId;

        Connection conn = Jsoup.connect(relUrl)
                .userAgent("Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10")
                .timeout(1000)
                .method(Connection.Method.GET);
        Document doc = conn.get();

        Element articleListDiv = doc.body().getElementById("recommendArticleList");
        if (articleListDiv == null) {
            // Page layout changed or the request was served a different page: nothing to parse here.
            System.out.println("OSC article list not found, skipping: " + relUrl);
            continue;
        }

        Elements articleList = articleListDiv.getElementsByClass("content");
        for (Element article : articleList) {
            Element linkNode = article.select("div a").first();
            Element desptionNode = article.getElementsByClass("line-clamp").first();
            Elements items = article.getElementsByClass("item");
            // The read and comment counters are taken by position (4th and 5th "item").
            if (linkNode == null || desptionNode == null || items.size() < 5) {
                continue;
            }
            String readNum = items.get(3).text();
            String commentNum = items.get(4).text();

            Article articleEntity = new Article();
            articleEntity.setAddress(linkNode.attr("href"));
            articleEntity.setTitle(linkNode.text());
            articleEntity.setDesption(desptionNode.text());
            articleEntity.setReadNum(oscReadCount(readNum));
            articleEntity.setCommentNum("".equals(commentNum) ? 0 : Integer.parseInt(commentNum));
            articleEntity.setType(type);
            articleEntity.setTime(new Date());
            articleEntity.setBlogType("OSC");
            articleEntity.setStatus(0);
            articleDao.save(articleEntity);

            System.out.println("文章閱讀數++++++=:" + articleEntity.getReadNum());
            System.out.println("文章總數++=:" + articleEntity.getAddress());
            System.out.println("文章類型+++:" + articleEntity.getType());
        }
        System.out.println("文章總數:" + articleList.size());
        System.out.println("文章絕對路勁地址:oooooooooooooooo++++++++++++++++++++++++++++++");
    }
}

/**
 * Converts an OSChina read-count label to an integer.
 *
 * @param text the label text; a value containing {@code K} is read as thousands
 *        (e.g. {@code "1.5K"} becomes 1500)
 * @return 0 for an empty label, otherwise the parsed count
 */
private static int oscReadCount(String text) {
    if ("".equals(text)) {
        return 0;
    }
    if (text.contains("K")) {
        float thousands = Float.parseFloat(text.replaceAll("K", ""));
        return (int) (thousands * 1000);
    }
    return Integer.parseInt(text);
}

/**
 * Fetches an OSChina article page and returns its content wrapped in a minimal
 * HTML document, followed by a link back to the original article.
 *
 * <p>The children of the element with id {@code articleContent} are collected
 * and the first child is dropped before rendering; the remainder is placed
 * inside a {@code div} with id {@code content_views}.
 *
 * @param postUrl URL of the OSChina article page
 * @return an HTML document containing the article content plus a source link
 *         to {@code postUrl}
 * @throws IOException if the page cannot be fetched, or if it contains no
 *         element with id {@code articleContent}
 */
public String searchOscUrl(String postUrl) throws IOException {
    Connection conn = Jsoup.connect(postUrl)
            .userAgent("Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10")
            .timeout(1000)
            .method(Connection.Method.GET);
    Document doc = conn.get();

    Element content = doc.body().getElementById("articleContent");
    if (content == null) {
        // Previously this surfaced as a bare NullPointerException with no hint of which URL failed.
        throw new IOException("No element with id \"articleContent\" found at " + postUrl);
    }

    Elements articleListDiv = content.children();
    // NOTE(review): the first child is presumably a non-article header/ad block — confirm.
    // Guarded so an article element without children no longer throws IndexOutOfBoundsException.
    if (!articleListDiv.isEmpty()) {
        articleListDiv.remove(0);
    }

    String html = "<!DOCTYPE html>\n" +
            "<html>\n" +
            "<body>" + "<div class=\"htmledit_views\" id=\"content_views\"> " + articleListDiv.outerHtml() + "</div>";
    html += "<br/>作者原文鏈接:" + "<p><a href=\"" + postUrl + "\" target=\"_blank\" rel=\"noopener\">" + postUrl + "</a></p>\n" +
            "</body>\n" +
            "</html>";
    System.out.println("文章總數:" + postUrl);

    return html;
}

 

3.公共bean

/**
 * JavaBean describing one crawled blog article, mapped to the table
 * {@code mto_csdn}.
 * date:2017-02-09
 */
@Entity
@Table(name = "mto_csdn")
@Cache(usage = CacheConcurrencyStrategy.READ_WRITE)
public class Article implements Serializable {

    private static final long serialVersionUID = -1153854616385727165L;

    /** Generated primary key. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    @SortableField
    @NumericField
    private long id;

    /** Relative address of the article link. */
    private String address;

    /** Article title. */
    private String title;

    /** Short summary of the article. */
    private String desption;

    /** Article publication time. */
    @Temporal(value = TemporalType.TIMESTAMP)
    private Date time;

    /** Number of reads. */
    @Column(name = "read_num")
    private int readNum;

    /** Number of comments. */
    @Column(name = "comment_num")
    private int commentNum;

    /** Category name of the article. */
    @Column(name = "type")
    private String type;

    /** Status flag stored in the {@code status} column. */
    @Column(name = "status")
    private int status;

    /** Name of the blog platform stored in the {@code blog_type} column. */
    @Column(name = "blog_type")
    private String blogType;

    // ---- accessors, in field declaration order ----

    /** @return the generated primary key */
    public long getId() {
        return this.id;
    }

    /** @param id the primary key to set */
    public void setId(long id) {
        this.id = id;
    }

    /** @return the relative address of the article link */
    public String getAddress() {
        return this.address;
    }

    /** @param address the relative address of the article link */
    public void setAddress(String address) {
        this.address = address;
    }

    /** @return the article title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the article title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the article summary */
    public String getDesption() {
        return this.desption;
    }

    /** @param desption the article summary */
    public void setDesption(String desption) {
        this.desption = desption;
    }

    /** @return the article time */
    public Date getTime() {
        return this.time;
    }

    /** @param time the article time */
    public void setTime(Date time) {
        this.time = time;
    }

    /** @return the number of reads */
    public int getReadNum() {
        return this.readNum;
    }

    /** @param readNum the number of reads */
    public void setReadNum(int readNum) {
        this.readNum = readNum;
    }

    /** @return the number of comments */
    public int getCommentNum() {
        return this.commentNum;
    }

    /** @param commentNum the number of comments */
    public void setCommentNum(int commentNum) {
        this.commentNum = commentNum;
    }

    /** @return the category name */
    public String getType() {
        return this.type;
    }

    /** @param type the category name */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the status flag */
    public int getStatus() {
        return this.status;
    }

    /** @param status the status flag */
    public void setStatus(int status) {
        this.status = status;
    }

    /** @return the blog platform name */
    public String getBlogType() {
        return this.blogType;
    }

    /** @param blogType the blog platform name */
    public void setBlogType(String blogType) {
        this.blogType = blogType;
    }
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章