Jsoup解析Html教程

1.獲取一個Document，這是Jsoup最核心的一個對象

有三種途徑來加載Document：字符串，URL地址，文件

/**
 * 
 */
package org.xdemo.example.jsoupdemo.input;
 
import java.io.File;
import java.io.IOException;
 
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
 
/**
 * @作者 Goofy
 * @郵件 [email protected]
 * @日期 2014-4-2上午10:54:53
 * @描述 
 */
public class ParseDocument {
     
    /**
     * 將String轉換成Document
     * @return org.jsoup.nodes.Document
     */
    public static Document parseHtmlFromString(){
        String html = "<html><head><title>標題</title></head>"
                + "<body><p>段落</p></body></html>";
        Document doc = Jsoup.parse(html);
        return doc;
    }
     
    /**
     * 注意：這是一個不安全的方法
     * 將String轉換成Html片段,注意防止跨站腳本攻擊
     * @return Element
     */
    public static Element parseHtmlFragmentFromStringNotSafe(){
        String html = "<div><p>Lorem ipsum.</p>";
        Document doc = Jsoup.parseBodyFragment(html);
        Element body = doc.body();
        return body;
    }
     
    /**
     * 這是一個安全的方法
     * 將String轉換成Html片段,注意防止跨站腳本攻擊
     * @return Element
     */
    public static Element parseHtmlFragmentFromStringSafe(){
        String html = "<div><p>Lorem ipsum.</p>";
        //白名單列表定義了哪些元素和屬性可以通過清潔器，其他的元素和屬性一律移除
        Whitelist wl=new Whitelist();
        //比較鬆散的過濾，包括
        //"a", "b", "blockquote", "br", "caption", "cite", "code", "col",
        //"colgroup", "dd", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6",
        //"i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong",
        //"sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u",
        //"ul"
        Whitelist.relaxed();
        //沒有任何標籤，只有文本
        Whitelist.none();
        //常規的過濾器
        //"a", "b", "blockquote", "br", "cite", "code", "dd", "dl", "dt", "em",
        //"i", "li", "ol", "p", "pre", "q", "small", "strike", "strong", "sub",
        //"sup", "u", "ul"
        Whitelist.basic();
        //常規的過濾器，多了一個img標籤
        Whitelist.basicWithImages();
        //文本類型的標籤
        //"b", "em", "i", "strong", "u"
        Whitelist.simpleText();
        //另外還可以自定義過濾規則,例如
        wl.addTags("a");
        //執行過濾
        Jsoup.clean(html, wl);
        Document doc = Jsoup.parseBodyFragment(html);
        Element body = doc.body();
        return body;
    }
     
    /**
     * 從URL加載
     * @return Document
     */
    public static Document parseDocumentFromUrl(){
        Document doc = null;
        try {
            doc = Jsoup.connect("http://www.google.com/").get();
            //獲取標題
            String title = doc.title();
            System.out.println(title);//輸出：Google
            //data(key,value)是該URL要求的參數
            //userAgent制定用戶使用的代理類型
            //cookie帶上cookie，如cookie("JSESSIONID","FDE234242342342423432432")
            //連接超時時間
            //post或者get方法
            doc = Jsoup.connect("http://www.xxxxx.com/")
                      .data("query", "Java")
                      .userAgent("Mozilla")
                      .cookie("auth", "token")
                      .timeout(3000)
                      .post();
             
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }
    /**
     * 從文件加載
     * @return Document
     */
    public static Document parseDocumentFromFile(){
        File input = new File("/tmp/input.html");
        Document doc=null;
        try {
            //從文件加載Document文檔
            doc = Jsoup.parse(input, "UTF-8");
            System.out.println(doc.title());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }
     
     
 
}

2.選擇器

package org.xdemo.example.jsoupdemo.extracter;
 
import java.util.regex.Pattern;
 
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
 
/**
 * @作者 Goofy
 * @郵件 [email protected]
 * @日期 2014-4-2上午10:41:19
 * @描述 選擇器 操作示例
 */
public class Selector {
 
public static void main(String[] args) {
Document doc;
try {
//獲取文檔
doc=Jsoup.connect("http://xxx.com/").get();
 
/*****獲取單一元素******/
//與JS類似的根據ID選擇的選擇器<div id="content"></div>
Element content = doc.getElementById("content");
 
/*****一下方法的返回值都是Elements集合******/
 
//獲取所有的a標籤<a href="#"></a>
content.getElementsByTag("a");
//類選擇器<div></div>
doc.getElementsByClass("divClass");
//獲取Document的所有元素
doc.getAllElements();
//根據屬性獲取元素<a href="#"></a>
doc.getElementsByAttribute("href");
//根據屬性前綴獲取元素 <li data-name="Peter Liu" data-city="ShangHai" data-lang="CSharp" data-food="apple">
doc.getElementsByAttributeStarting("data-");
//根據key-value選擇如<a href="http://xdemo.org"></a>
doc.getElementsByAttributeValue("href","http://xdemo.org");
//和上面的正好相反
doc.getElementsByAttributeValueNot("href","http://xdemo.org");
//根據key-value,其中value可能是key對應屬性的一個子字符串，選擇如<a href="http://xdemo.org"></a>
doc.getElementsByAttributeValueContaining("href", "xdemo");
//根據key-value,其中key對應值的結尾是value，選擇如<a href="http://xdemo.org"></a>
doc.getElementsByAttributeValueEnding("href", "org");
//和上面的正好相反
doc.getElementsByAttributeValueStarting("href","http://xdemo");
//正則匹配，value需要滿足正則表達式，<a href="http://xdemo.org"></a>,如href的值含有漢字
doc.getElementsByAttributeValueMatching("href",Pattern.compile("[\u4e00-\u9fa5]"));
//同上
doc.getElementsByAttributeValueMatching("href", "[\u4e00-\u9fa5]");
//根據元素所在的z-index獲取元素
doc.getElementsByIndexEquals(0);
//獲取z-index大於x的元素
doc.getElementsByIndexGreaterThan(0);
//和上面的正好相反
doc.getElementsByIndexLessThan(10);
 
//遍歷標籤
for (Element link : content.getElementsByTag("a")) {
 String linkHref = link.attr("href");
 String linkText = link.text();
}
 
/**************一些其他常用的方法**************/
//獲取網頁標題
doc.title();
//獲取頁面的所有文本
doc.text();
 
//爲元素添加一個css class
content.addClass("newClass");
//根據屬性獲取值
content.attr("id");
//獲取所有子元素
content.children();
//獲取元素內的所有文本
content.text();
//獲取同級元素
content.siblingElements();
 
 
} catch (Exception e) {
e.printStackTrace();
}
 
}
 
}