xpath是專業的XML結構化文檔查詢語言,語法功能強大,本文不涉及xpath語法教程。
jsoup 是一款Java 的HTML解析器,可直接解析某個URL地址、HTML文本內容。它提供了一套非常省力的API,可通過DOM,CSS以及類似於jQuery的操作方法來取出和操作數據,但是選取某個元素時還是沒有xpath那麼簡單直接,而且xpath帶了很多選擇庫。
然而遺憾的是,jsoup並不支持xpath,於是博主就寫了一個讓jsoup支持xpath的工具類,希望能幫助到有需要的朋友!
工具類
package com.ry.mytools.util;
import com.sun.org.apache.xerces.internal.dom.ElementImpl;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.*;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/**
* * Jsoup的xpath解析工具類
* *
* * @author liuhh
* *
*
*/
@SuppressWarnings("restriction")
public class JsoupParserUtil {
protected final static DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
private final static Logger log = LoggerFactory.getLogger(JsoupParserUtil.class);
private final static XPath xPath = XPathFactory.newInstance().newXPath();
protected static TransformerFactory tf = TransformerFactory.newInstance();
private static final Lock LOCK = new ReentrantLock();
/**
 * Counts the nodes selected by an XPath expression.
 *
 * <p>Despite the method name, the value returned is the length of the node-set the
 * expression evaluates to, not the number of children of {@code ele}.
 *
 * @param ele   jsoup element (or document) that is converted to a W3C DOM and queried
 * @param xpath XPath expression expected to evaluate to a node-set
 * @return the number of matched nodes; {@code 0} when nothing matches, when the result is
 *         not a node-set, or when evaluation fails
 */
public static int getEleChildNum(final org.jsoup.nodes.Element ele, final String xpath) {
    try {
        Object res = parse(ele, xpath, XPathConstants.NODESET);
        // instanceof already rejects null, so no separate null checks are needed.
        if (res instanceof NodeList) {
            return ((NodeList) res).getLength();
        }
    } catch (Exception e) {
        // The exception message goes through a placeholder (it may itself contain "{}"),
        // and the throwable is passed last so the stack trace is not lost.
        log.error("根據xpath:{},獲取子節點個數出現錯誤,錯誤原因:{}", xpath, e.getMessage(), e);
    }
    return 0;
}
/**
 * Evaluates an XPath expression as a boolean against the given jsoup element.
 *
 * @param ele   jsoup element (or document) that is converted to a W3C DOM and queried
 * @param xpath XPath expression to evaluate with {@link XPathConstants#BOOLEAN}
 * @return the boolean result of the expression; {@code false} when evaluation fails or
 *         yields no boolean
 */
public static boolean exists(final org.jsoup.nodes.Element ele, final String xpath) {
    try {
        Object res = parse(ele, xpath, XPathConstants.BOOLEAN);
        return res instanceof Boolean && (Boolean) res;
    } catch (Exception e) {
        // Message via placeholder, throwable last so the stack trace is logged as well.
        log.error("檢查xpath:{},是否存在時出現錯誤,!{}", xpath, e.getMessage(), e);
    }
    return false;
}
/**
 * Evaluates an XPath expression with {@link XPathConstants#NODE} and returns the result as
 * a W3C element.
 *
 * <p>The returned node belongs to the W3C document that is freshly built from {@code ele}
 * for this call, so it is a copy and not linked to the jsoup tree.
 *
 * @param ele   jsoup element (or document) that is converted to a W3C DOM and queried
 * @param xpath XPath expression selecting an element
 * @return the selected node when it is an {@code ElementImpl}; {@code null} when nothing
 *         matches, the node is of another type, or evaluation fails
 */
public static ElementImpl getW3cElementImpl(final org.jsoup.nodes.Element ele, final String xpath) {
    try {
        Object res = parse(ele, xpath, XPathConstants.NODE);
        if (res instanceof ElementImpl) {
            return (ElementImpl) res;
        }
    } catch (Exception e) {
        // Message via placeholder, throwable last so the stack trace is logged as well.
        log.error("根據xpath:{},得到w3c的Element對象出現錯誤,原因:{}", xpath, e.getMessage(), e);
    }
    return null;
}
/**
 * Evaluates an XPath expression with {@link XPathConstants#NODE} and returns the match as
 * a jsoup element.
 *
 * <p>The match is converted back through {@link #getJsoupEle(ElementImpl)}, which re-parses
 * serialized markup; the result is therefore a new jsoup element, not the instance that
 * lives in the tree of {@code ele}.
 *
 * @param ele   jsoup element (or document) that is converted to a W3C DOM and queried
 * @param xpath XPath expression selecting an element
 * @return the converted element, or {@code null} when nothing matches, the match is not an
 *         {@code ElementImpl}, or any step fails
 */
public static org.jsoup.nodes.Element getJsoupElement(final org.jsoup.nodes.Element ele, final String xpath) {
    try {
        Object res = parse(ele, xpath, XPathConstants.NODE);
        if (res instanceof ElementImpl) {
            return getJsoupEle((ElementImpl) res);
        }
    } catch (Exception e) {
        // Message via placeholder, throwable last so the stack trace is logged as well.
        log.error("根據xpath:{},得到jsoup的Element對象出現錯誤,原因:{}", xpath, e.getMessage(), e);
    }
    return null;
}
/**
 * Evaluates an XPath expression as a node-set and returns the element matches as jsoup
 * {@link Elements}.
 *
 * <p>Only nodes that are {@code ElementImpl} instances are converted; nodes whose
 * conversion fails are skipped instead of being stored as {@code null} entries.
 *
 * @param ele   jsoup element (or document) that is converted to a W3C DOM and queried
 * @param xpath XPath expression selecting elements
 * @return the converted elements (possibly empty when no match could be converted), or
 *         {@code null} when the expression matches nothing or evaluation fails
 */
public static Elements getJsoupElements(final org.jsoup.nodes.Element ele, final String xpath) {
    try {
        NodeList nodeList = getNodeList(ele, xpath);
        if (null == nodeList || nodeList.getLength() <= 0) {
            return null;
        }
        Elements elements = new Elements();
        for (int i = 0, len = nodeList.getLength(); i < len; i++) {
            Node node = nodeList.item(i);
            if (node instanceof ElementImpl) {
                org.jsoup.nodes.Element element = getJsoupEle((ElementImpl) node);
                // getJsoupEle returns null on failure; keep nulls out of the result list.
                if (null != element) {
                    elements.add(element);
                }
            }
        }
        return elements;
    } catch (Exception e) {
        // Message via placeholder, throwable last so the stack trace is logged as well.
        log.error("根據xpath:{},得到jsoup的Element對象出現錯誤,原因:{}", xpath, e.getMessage(), e);
    }
    return null;
}
/**
 * Runs an XPath expression against a jsoup element and returns the W3C node-set it selects.
 *
 * @param ele   jsoup element (or document) that is converted to a W3C DOM and queried
 * @param xpath XPath expression evaluated with {@link XPathConstants#NODESET}
 * @return the resulting {@link NodeList}, or {@code null} when the evaluation produced no
 *         node list or threw
 */
public static NodeList getNodeList(final org.jsoup.nodes.Element ele, final String xpath) {
    NodeList matched = null;
    try {
        final Object evaluated = parse(ele, xpath, XPathConstants.NODESET);
        if (evaluated instanceof NodeList) {
            matched = (NodeList) evaluated;
        }
    } catch (Exception e) {
        log.error(e.getMessage(), e);
    }
    return matched;
}
/**
 * Evaluates an XPath expression and returns its textual result.
 *
 * <ul>
 *   <li>Exactly one matched node: the expression is evaluated again with
 *       {@link XPathConstants#STRING} and that string is returned.</li>
 *   <li>Several matched nodes: the non-null {@code getNodeValue()} of every node is
 *       concatenated, with each {@code "\r\n"} replaced by {@code "."}.</li>
 *   <li>No match, a non-node-set result, or a failure: {@code null}.</li>
 * </ul>
 *
 * @param ele   jsoup element (or document) that is converted to a W3C DOM and queried
 * @param xpath XPath expression expected to evaluate to a node-set
 * @return the text as described above, or {@code null}
 */
public static String getXpathString(final org.jsoup.nodes.Element ele, final String xpath) {
    try {
        // Convert the jsoup tree once and reuse it for every evaluation below, instead of
        // rebuilding the whole W3C DOM for the count, the string and the list queries.
        final Node root = fromJsoup(ele);
        final Object res = parse(root, xpath, XPathConstants.NODESET);
        if (!(res instanceof NodeList)) {
            return null;
        }
        final NodeList nodeList = (NodeList) res;
        final int length = nodeList.getLength();
        if (length <= 0) {
            return null;
        }
        if (1 == length) {
            Object text = parse(root, xpath, XPathConstants.STRING);
            return null == text ? null : text.toString();
        }
        StringBuilder stringBuilder = new StringBuilder();
        for (int i = 0; i < length; i++) {
            Node node = nodeList.item(i);
            String text = null == node ? null : node.getNodeValue();
            if (null != text) {
                stringBuilder.append(text.replace("\r\n", "."));
            }
        }
        return stringBuilder.toString();
    } catch (Exception e) {
        // Report through the logger only (no printStackTrace); the throwable is passed last
        // so the stack trace still ends up in the log.
        log.error("根據xpath:{}查詢字符串時出現錯誤:{}", xpath, e.getMessage(), e);
    }
    return null;
}
/**
 * Evaluates an XPath expression as a node-set and returns the {@code getNodeValue()} of
 * every matched node, in node-set order.
 *
 * <p>Entries may be {@code null}: a missing node or a node without a node value
 * contributes a {@code null} entry.
 *
 * @param ele   jsoup element (or document) that is converted to a W3C DOM and queried
 * @param xpath XPath expression expected to evaluate to a node-set
 * @return the node values, or {@code null} when nothing matches, the result is not a
 *         node-set, or evaluation fails
 */
public static List<String> getXpathListString(final org.jsoup.nodes.Element ele, final String xpath) {
    try {
        Object res = parse(ele, xpath, XPathConstants.NODESET);
        if (!(res instanceof NodeList)) {
            return null;
        }
        NodeList nodeList = (NodeList) res;
        int length = nodeList.getLength();
        if (length <= 0) {
            return null;
        }
        List<String> list = new ArrayList<>(length);
        for (int i = 0; i < length; i++) {
            Node node = nodeList.item(i);
            list.add(null == node ? null : node.getNodeValue());
        }
        return list;
    } catch (Exception e) {
        // Message via placeholder, throwable last so the stack trace is logged as well.
        log.error("根據xpath:{}查詢字符串列表時出現錯誤:{}", xpath, e.getMessage(), e);
    }
    return null;
}
/**
 * Evaluates an XPath expression against a jsoup element.
 *
 * <p>The element is first converted with {@link #fromJsoup(org.jsoup.nodes.Element)}; the
 * evaluation itself is delegated to {@link #parse(Node, String, QName)}.
 *
 * @param doc      jsoup element (or document) to query
 * @param xPathStr XPath expression
 * @param qName    requested result type, e.g. one of the {@code XPathConstants} values
 * @return whatever the W3C overload returns for the converted tree
 */
public static Object parse(final org.jsoup.nodes.Element doc, final String xPathStr, final QName qName) {
    return parse(fromJsoup(doc), xPathStr, qName);
}
/**
 * Evaluates an XPath expression against a W3C node using the shared {@code xPath} instance.
 *
 * <p>Evaluation is serialized through {@code LOCK} because the single static
 * {@code XPath} object is shared by all callers.
 *
 * @param doc      W3C node used as the evaluation context
 * @param xPathStr XPath expression; must not be blank
 * @param qName    requested result type, e.g. one of the {@code XPathConstants} values
 * @return the evaluation result, or {@code null} when an argument is missing or the
 *         evaluation throws (a warning is logged in both cases)
 */
public static Object parse(final Node doc, final String xPathStr, final QName qName) {
    if (doc == null) {
        log.warn("解析文檔爲null!");
        return null;
    }
    if (StringUtils.isBlank(xPathStr)) {
        log.warn("解析的Xpath路徑爲空!");
        return null;
    }
    if (null == qName) {
        log.warn("解析類型爲null!");
        return null;
    }
    try {
        // Acquire the lock before entering the try whose finally releases it, so unlock()
        // can never run for a lock that was not obtained.
        LOCK.lock();
        try {
            return xPath.evaluate(xPathStr, doc, qName);
        } finally {
            LOCK.unlock();
        }
    } catch (Exception e) {
        // The throwable is passed last so the cause is kept in the log, not just its message.
        log.warn("解析Xpath:{},出現錯誤,解析類型:{},錯誤原因:{}!", xPathStr, qName, e.getMessage(), e);
    }
    return null;
}
/**
 * Converts a W3C element into a jsoup element by serializing it with
 * {@link #getW3cDocString(Node)} and re-parsing the markup with {@code Jsoup.parse}.
 *
 * <p>The element returned is the first child of the re-parsed document's body.
 *
 * <p>NOTE(review): this relies on jsoup's HTML parser placing the serialized element as
 * the first child of {@code <body>}. Elements that the parser presumably relocates or
 * drops out of context (for example {@code <title>} or a bare {@code <tr>}) would end up
 * in the failure branch below - confirm against the jsoup version in use.
 *
 * @param elementImpl W3C element to convert
 * @return the corresponding jsoup element, or {@code null} when serialization or parsing
 *         fails or the parsed body has no child element
 */
public static org.jsoup.nodes.Element getJsoupEle(final ElementImpl elementImpl) {
    try {
        String value = getW3cDocString(elementImpl);
        org.jsoup.nodes.Document document = Jsoup.parse(value);
        return document.body().child(0);
    } catch (Exception e) {
        // Message via placeholder, throwable last so the stack trace is logged as well.
        log.error("根據ElementImpl得到Jsoup的Element出現錯誤,錯誤原因:{}", e.getMessage(), e);
        return null;
    }
}
/**
 * Converts a W3C document into a jsoup document.
 *
 * <p>The W3C document is serialized with {@link #getW3cDocString(Node)} and the resulting
 * markup is handed to {@code Jsoup.parse}.
 *
 * @param doc W3C document to convert
 * @return the jsoup document parsed from the serialized markup
 * @throws Exception if serializing the W3C document fails
 */
public static org.jsoup.nodes.Document fromW3C(final Document doc) throws Exception {
    return Jsoup.parse(getW3cDocString(doc));
}
/**
 * Converts a jsoup element (or document) into a new W3C document.
 *
 * <p>For a jsoup {@code Document} the first child element becomes the root of the W3C
 * document; for any other element the element itself becomes the root. The subtree is
 * copied by a {@code W3CBuilder} visitor.
 *
 * @param in jsoup element or document to convert; may be {@code null}
 * @return the new W3C document (empty when {@code in} is a document without child
 *         elements), or {@code null} when {@code in} is {@code null} or no
 *         {@code DocumentBuilder} could be created
 */
public static Node fromJsoup(final org.jsoup.nodes.Element in) {
    if (null == in) {
        return null;
    }
    try {
        DocumentBuilder builder = factory.newDocumentBuilder();
        Document out = builder.newDocument();
        org.jsoup.nodes.Element rootEl = in;
        if (in instanceof org.jsoup.nodes.Document) {
            // Look at child *elements*, not child nodes: a document holding only comments
            // or other non-element nodes has no element at index 0.
            Elements childElements = in.children();
            if (childElements.isEmpty()) {
                return out;
            }
            rootEl = childElements.get(0);
        }
        NodeTraversor traversor = new NodeTraversor(new W3CBuilder(out));
        traversor.traverse(rootEl);
        return out;
    } catch (ParserConfigurationException e) {
        // Previously swallowed silently; callers still get null but the cause is now visible.
        log.error("創建W3C的DocumentBuilder出現錯誤,錯誤原因:{}", e.getMessage(), e);
        return null;
    }
}
/**
 * Serializes a W3C node to a string using a {@link Transformer} obtained from the shared
 * {@code tf} factory.
 *
 * <p>Creating the transformer and running the transformation both happen while
 * {@code LOCK} is held.
 *
 * @param doc W3C node (document or element) to serialize
 * @return the serialized markup
 * @throws IllegalStateException if the transformer cannot be created or the transformation
 *                               fails (wraps the {@link TransformerException})
 * @throws Exception             declared because closing the writer is part of the
 *                               try-with-resources statement
 */
public static String getW3cDocString(final Node doc) throws Exception {
    try (StringWriter buffer = new StringWriter()) {
        final DOMSource input = new DOMSource(doc);
        final StreamResult output = new StreamResult(buffer);
        LOCK.lock();
        try {
            tf.newTransformer().transform(input, output);
        } finally {
            LOCK.unlock();
        }
        return buffer.toString();
    } catch (TransformerException e) {
        throw new IllegalStateException(e);
    }
}
/** Matches every character that is dropped from an attribute name the DOM rejected. */
private static final java.util.regex.Pattern ATTR_NAME_ILLEGAL_CHARS =
        java.util.regex.Pattern.compile("[^-a-zA-Z0-9_:.]");
/** Shape a cleaned-up attribute name must have before it is retried. */
private static final java.util.regex.Pattern ATTR_NAME_CLEANED =
        java.util.regex.Pattern.compile("[a-zA-Z_:][-a-zA-Z0-9_:.]*");

/**
 * Copies all attributes of a jsoup node onto a W3C element.
 *
 * <p>HTML in the wild can carry attribute names that are not legal XML names; the DOM
 * rejects those with {@code INVALID_CHARACTER_ERR}. Such a name is retried with the
 * offending characters removed and, if still unusable, the attribute is skipped, so one
 * malformed attribute no longer aborts the conversion of the whole tree.
 *
 * @param source jsoup node whose attributes are read
 * @param el     W3C element that receives the attributes
 * @throws DOMException for any DOM failure other than an invalid attribute name
 */
public static void copyAttributes(final org.jsoup.nodes.Node source, final Element el) {
    for (Attribute attribute : source.attributes()) {
        final String key = attribute.getKey();
        final String value = attribute.getValue();
        try {
            // Names the DOM accepts are copied unchanged, exactly as before.
            el.setAttribute(key, value);
        } catch (DOMException e) {
            if (e.code != DOMException.INVALID_CHARACTER_ERR) {
                throw e;
            }
            String cleaned = ATTR_NAME_ILLEGAL_CHARS.matcher(key).replaceAll("");
            if (ATTR_NAME_CLEANED.matcher(cleaned).matches()) {
                el.setAttribute(cleaned, value);
            }
        }
    }
}
}
/**
 * jsoup {@link NodeVisitor} that mirrors the visited jsoup nodes into a W3C {@link Document}.
 *
 * <p>Elements become W3C elements (attributes copied via
 * {@code JsoupParserUtil.copyAttributes}), text and data nodes become W3C text nodes,
 * comments become W3C comments; every other node type is ignored.
 */
class W3CBuilder implements NodeVisitor {
    /** Target document; also the factory for every W3C node created here. */
    private final Document doc;
    /** W3C element that currently receives new children; {@code null} until the first element is visited. */
    private Element dest;

    public W3CBuilder(Document doc) {
        this.doc = doc;
    }

    /**
     * Called when a jsoup node is entered: creates the W3C counterpart and attaches it.
     * A newly created element becomes the current insertion point.
     */
    @Override
    public void head(final org.jsoup.nodes.Node source, int depth) {
        if (source instanceof org.jsoup.nodes.Element) {
            final org.jsoup.nodes.Element jsoupElement = (org.jsoup.nodes.Element) source;
            final Element created = doc.createElement(jsoupElement.tagName());
            JsoupParserUtil.copyAttributes(jsoupElement, created);
            // The very first element has no parent element yet and is attached to the document.
            final Node parent = (dest == null) ? doc : dest;
            parent.appendChild(created);
            dest = created;
            return;
        }
        if (source instanceof org.jsoup.nodes.TextNode) {
            final String wholeText = ((org.jsoup.nodes.TextNode) source).getWholeText();
            dest.appendChild(doc.createTextNode(wholeText));
            return;
        }
        if (source instanceof org.jsoup.nodes.Comment) {
            final String commentData = ((org.jsoup.nodes.Comment) source).getData();
            dest.appendChild(doc.createComment(commentData));
            return;
        }
        if (source instanceof org.jsoup.nodes.DataNode) {
            final String wholeData = ((org.jsoup.nodes.DataNode) source).getWholeData();
            dest.appendChild(doc.createTextNode(wholeData));
        }
        // Any other jsoup node type is intentionally not copied.
    }

    /**
     * Called when a jsoup node is left: after an element, the insertion point moves back to
     * the parent element, if there is one.
     */
    @Override
    public void tail(final org.jsoup.nodes.Node source, int depth) {
        if (!(source instanceof org.jsoup.nodes.Element)) {
            return;
        }
        final Node parent = dest.getParentNode();
        if (parent instanceof Element) {
            dest = (Element) parent;
        }
    }
}
測試
import java.io.IOException;
import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class JsoupParserUtilsTest {
public static void main(String[] args) throws Exception, IOException {
String url = "http://mil.news.sina.com.cn/china/2016-09-29/doc-ifxwmamy9955666.shtml";
Document doc = Jsoup.parse(new URL(url), 10000);
String titleXpath = "//*[@id='main_title']/text()";
String timeXpath = "//*[@id='page-tools']/span/span[position() = 1]";
System.out.println(JsoupParserUtils.exists(doc, "/html/body/div[position>1000000]"));
System.out.println(JsoupParserUtils.getXpathString(doc, titleXpath));
Element element = JsoupParserUtils.getJsoupElement(doc, timeXpath);
System.out.println(element.text());
System.out.println(element.attr("class"));
}
}
————————————————
讓你的Jsoup支持Xpath