本文出自:http://blog.csdn.net/dt235201314/article/details/79003591
一、效果圖
二、概述
jsoup 是一款Java 的HTML解析器,可直接解析某個URL地址、HTML文本內容。它提供了一套非常省力的API,可通過DOM,CSS以及類似於jQuery的操作方法來取出和操作數據。
jsoup的主要功能如下:
1. 從一個URL,文件或字符串中解析HTML;
2. 使用DOM或CSS選擇器來查找、取出數據;
3. 可操作HTML元素、屬性、文本;
三、看代碼
1.選取要抓取的網絡地址 例:我的博客 http://blog.csdn.net/dt235201314
2.選取要抓取的內容
博主信息類:
public class BlogAuthor { //作者名字 private String authorName; //訪問數量 private String visitNumber; //積分 private String mark; //排名 private String rank; //原創文章數量 private String originalArticleNumber; //轉載文章數量 private String reprintArticleNumber; //翻譯文章數量 private String translateArticleNumber; //評論數量 private String commentNumber; //頭像鏈接 private String avatarUrl; //我的代號 private String code; //我的名言 private String myHelloWorld;
頁面文章摘要類:
public class BlogIntroduction { //文章標題 private String title; //文章摘要 private String description; //文章信息,包括閱讀量,評論數,發表時間等 private String msg; //文章分類 private String category; //文章鏈接 private String url;
3.相關操作(谷歌瀏覽器)
這個時候瀏覽器會出現源碼,且鎖定到你選取的位置
其它位置一樣
、
4.代碼解析
添加依賴:
compile 'org.jsoup:jsoup:1.9.2'
/** * 獲取博主的基本信息 * * @return 博主信息 */ public static BlogAuthor getBlogAutoMessage() { Document doc; BlogAuthor blogAuthor = null; Elements elements; /**作者名字*/ String authorName; /** 訪問數量*/ String visitNumber; /** 積分*/ String mark; /** 排名*/ String rank; /** 原創文章數量*/ String originalArticleNumber; /** 轉載文章數量*/ String reprintArticleNumber; /** 翻譯文章數量*/ String translateArticleNumber; /** 評論數量*/ String commentNumber; /** 頭像鏈接*/ String avatarUrl; /**我的代號*/ String code; /**我的名言*/ String myHelloWorld; try { doc = Jsoup.connect(BLOG_HOMEPAGE) .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31") //"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31" //"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0" .timeout(10000).get(); if (doc != null) { elements = doc.select("div#blog_title").select("h2").select("a"); code = elements.first().text(); elements = doc.select("div#blog_title").select("h3"); myHelloWorld = elements.first().text(); elements = doc.select("div#blog_userface").select("a.user_name"); authorName = elements.first().text(); elements = doc.select("div#blog_userface").select("a").select("img"); avatarUrl = elements.first().attr("src"); elements = doc.select("ul#blog_rank").select("li"); visitNumber = elements.get(0).text(); mark = elements.get(1).text(); rank = elements.get(3).text(); elements = doc.select("ul#blog_statistics").select("li"); originalArticleNumber = elements.get(0).text(); reprintArticleNumber = elements.get(1).text(); translateArticleNumber = elements.get(2).text(); commentNumber = elements.get(3).text(); blogAuthor = new BlogAuthor(code,myHelloWorld,authorName, visitNumber, mark, rank, originalArticleNumber, reprintArticleNumber, translateArticleNumber, commentNumber, avatarUrl); } } catch (Exception e) { e.printStackTrace(); blogAuthor = new BlogAuthor("","","", "訪問:0", "積分:0", "積分:0", "原創:0", "轉載:0", 
"譯文:0", "評論:0", ""); } return blogAuthor; }
/**
 * Loads one page of the time-ordered article list and converts every entry into a
 * {@code BlogIntroduction}.
 *
 * <p>The category of every returned entry is set to the empty string. An
 * {@code IOException} during the fetch has its stack trace printed and results in
 * {@code null}.
 *
 * @param pages 1-based page number to load
 * @return the entries found on that page; {@code null} when {@code pages} is below 1 or
 *     above the value returned by {@code getBlogPages()}, or when the page could not be
 *     fetched
 */
public static List<BlogIntroduction> getOnePageBlogIntroductionByTime(int pages) {
    // The lower bound is tested first so getBlogPages() is not called for pages < 1.
    if (pages < 1 || pages > getBlogPages()) {
        return null;
    }

    List<BlogIntroduction> result = null;
    try {
        String pageUrl = BLOG_HOMEPAGE + "/article/list/" + pages;
        Document page = Jsoup.connect(pageUrl)
                .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31")
                .timeout(10000)
                .get();

        Elements entries = page.select("div#article_list > div");
        if (entries != null) {
            result = new ArrayList<>();
            for (int i = 0; i < entries.size(); i++) {
                Element entry = entries.get(i);
                BlogIntroduction intro = new BlogIntroduction();

                // The heading supplies both the title text and the article link.
                Elements heading = entry.select("div.article_title > h1");
                String headingText = heading.text();
                String summary = entry.select("div.article_description").text();
                String info = entry.select("div.article_manage").text();
                String href = heading.select("span.link_title").select("a").attr("href");

                intro.setTitle(headingText);
                intro.setDescription(summary);
                intro.setMsg(info);
                intro.setUrl(BASE_PATH + href);
                intro.setCategory("");
                result.add(intro);
            }
        }
    } catch (IOException fetchFailure) {
        fetchFailure.printStackTrace();
    }
    return result;
}
測試類
/**
 * Manual smoke test: fetches the author profile and the first page of article summaries
 * through {@code JsoupUtil} and prints them to standard output.
 */
public class JsoupUtilTest {

    /**
     * Runs both scrapers and prints every value on its own line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        BlogAuthor blogAuthor = JsoupUtil.getBlogAutoMessage();
        List<BlogIntroduction> blogs = JsoupUtil.getOnePageBlogIntroductionByTime(1);

        print(blogAuthor.getAuthorName());
        print(blogAuthor.getCommentNumber());
        print(blogAuthor.getAvatarUrl());
        print(blogAuthor.getVisitNumber());
        print(blogAuthor.getRank());
        print(blogAuthor.getMark());
        print(blogAuthor.getOriginalArticleNumber());
        print(blogAuthor.getReprintArticleNumber());
        print(blogAuthor.getTranslateArticleNumber());
        print(blogAuthor.getCode());
        print(blogAuthor.getMyHelloWorld());

        // getOnePageBlogIntroductionByTime returns null for an out-of-range page or a failed
        // fetch; iterating it unguarded would end the run with a NullPointerException.
        if (blogs == null) {
            System.err.println("No blog introductions were returned.");
            return;
        }
        for (BlogIntroduction blog : blogs) {
            print(blog.getTitle());
            print(blog.getDescription());
        }
    }

    /** Prints one value behind the "==-->" marker used for every output line. */
    private static void print(String value) {
        System.out.println("==-->" + value);
    }
}
輸出見效果圖
參考內容:
Android 個人博客客戶端——My CSDN 的實現(2)
四、源碼下載
如果文章對你有用,歡迎 star,歡迎關注