最近很長一段時間都在做信息採集,現在把所用到的知識系統地整理一下,給想做採集系統的同志們做個參考。
1 需要的jar包如下:httpclient.jar、spring mvc所需要的jar包,以及jsoup所用到的jar包。流程圖如下:
1. httpclient的用法如下:
/**
 * Minimal HTTP GET helper built on Apache HttpClient, used to download the
 * raw text of a page before it is parsed.
 */
public class TestHttpClient {

    /**
     * Requests {@code URL} with an HTTP GET and returns the response body as text.
     *
     * <p>The body is returned only when the status code is 200 and the response
     * carries an entity. In every other case, including any {@link IOException}
     * raised while talking to the server, the empty string is returned and, for
     * I/O failures, a one-line error message is printed to standard output.
     *
     * @param URL     site address to request
     * @param coding  charset name used to decode the response body
     * @param xmlbean not read by the active code path
     * @return the decoded page content, or {@code ""} if nothing could be read
     * @author ZF
     */
    @SuppressWarnings("deprecation")
    public static String SimpleConnection(String URL, String coding, XmlConfigBean xmlbean) {
        String value = "";
        DefaultHttpClient httpclient = new DefaultHttpClient();
        try {
            // The retry handler has to be installed before execute(); setting it
            // afterwards has no effect on the request that was already sent.
            // NOTE(review): requestRetryHandler is defined outside the code shown here.
            httpclient.setHttpRequestRetryHandler(requestRetryHandler);

            HttpGet request = new HttpGet(URL);
            // 10 second limits for socket reads, connecting, and leasing a connection.
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(10000)
                    .setConnectTimeout(10000)
                    .setConnectionRequestTimeout(10000)
                    .build();
            request.setConfig(requestConfig);

            CloseableHttpResponse response = httpclient.execute(request);
            try {
                if (response.getStatusLine().getStatusCode() == 200) {
                    HttpEntity resEntity = response.getEntity();
                    // A response may legitimately have no entity, so check for null
                    // before touching any of its headers.
                    if (resEntity != null) {
                        Header contentEncoding = resEntity.getContentEncoding();
                        // Content-coding tokens are case-insensitive.
                        if (contentEncoding != null
                                && "gzip".equalsIgnoreCase(contentEncoding.getValue())) {
                            resEntity = new GzipDecompressingEntity(resEntity);
                        }
                        value = EntityUtils.toString(resEntity, coding);
                    }
                }
            } finally {
                response.close();
            }
        } catch (IOException e) {
            // Best effort: report the failure and fall through to return "".
            String error = "SimpleHttpClient.SimpleConnection():" + URL + " >> " + e.getMessage();
            System.out.println(error);
        } finally {
            httpclient.close();
        }
        return value;
    }
}
2 獲取到請求的內容後,就可以對其進行解析:提取需要的數據,過濾掉不需要的內容。對靜態頁面的處理有兩種框架可用,分別是htmlparser和jsoup,我選擇使用後者。jsoup的語法有點類似於jquery,我相信只要你會用jquery,使用jsoup就很容易上手。
1 在src下新建兩個html文件,分別爲 test_Table.html 和 test_attribute.html
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<!-- Fixture for the jsoup table demo: two tables, each holding one row of two cells. -->
<html>
  <head>
    <title>jsoup解析table的用法</title>
  </head>
  <body>
    <table>
      <tr>
        <td>jsoup解析1表格的第一行的第一個表格</td>
        <td>jsoup解析1表格的第一行的第二個表格</td>
      </tr>
    </table>
    <table>
      <tr>
        <td>jsoup解析2表格的第一行的第一個表格</td>
        <td>jsoup解析2表格的第一行的第二個表格</td>
      </tr>
    </table>
  </body>
</html>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<!-- Fixture for the jsoup attribute demos: two divs with class/id attributes and
     one img whose real path is stored in real_str while src is left empty. -->
<html>
  <head>
    <title>jsoup解析標籤獲取屬性值和修改屬性值的用法的用法</title>
  </head>
  <body>
    <div class="第一個div的class屬性" id="第一個div的id屬性" ></div>
    <div class="第二個div的class屬性" id="第二個div的id屬性" ></div>
    <!-- No punctuation between attributes: a stray ';' would be parsed as an
         attribute of its own. -->
    <img real_str="/img/bobo.jpg" src="" />
  </body>
</html>
3 使用jsoup解析html的代碼如下:
package cn.tsou.connector;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;
/**
 * JUnit demos of basic jsoup usage against two local HTML fixture files:
 * selecting table rows, reading attributes, and rewriting attributes.
 */
public class TestJsoup {
    /** Path of the HTML fixture read by {@link #TestTable()}. */
    public String test_Table = "src/test_Table.html";
    /** Path of the HTML fixture read by the two attribute demos. */
    public String test_attribute = "src/test_attribute.html";

    /**
     * Parses the table fixture and prints, for every selected row, first its
     * inner HTML and then its plain text.
     */
    @Test
    public void TestTable() throws IOException {
        String table_Html = readFileName(test_Table, "gbk");
        // CSS-style selector: every <tr> nested inside a <table>.
        String select_str = "table tr";
        Document document = Jsoup.parse(table_Html);
        Elements elements = document.select(select_str);
        for (int i = 0; i < elements.size(); i++) {
            System.out.println("獲取html內容" + elements.get(i).html());
            System.out.println("獲取純文本不含有標籤" + elements.get(i).text());
        }
    }

    /**
     * Parses the attribute fixture and prints the class and id attribute of
     * every div element.
     */
    @Test
    public void Test_Attribute() throws IOException {
        String attribute_html = readFileName(test_attribute, "gbk");
        Document doc = Jsoup.parse(attribute_html);
        Elements divElement = doc.getElementsByTag("div");
        for (int i = 0; i < divElement.size(); i++) {
            System.out.println("div的class屬性值:\t " + divElement.get(i).attr("class"));
            System.out.println("div的 id 屬性值:\t " + divElement.get(i).attr("id"));
        }
    }

    /**
     * Parses the attribute fixture, copies each img element's real_str value
     * into its src attribute, removes real_str, and prints the resulting document.
     */
    @Test
    public void Test_modifiedAttribute() throws IOException {
        String attribute_html = readFileName(test_attribute, "gbk");
        Document doc = Jsoup.parse(attribute_html);
        Elements imgs = doc.select("img");
        for (int i = 0; i < imgs.size(); i++) {
            System.out.println(imgs.get(i).attr("real_str"));
            // Assign the value of real_str to src.
            imgs.get(i).attr("src", imgs.get(i).attr("real_str"));
            // Drop the now redundant real_str attribute.
            imgs.get(i).removeAttr("real_str");
        }
        System.out.println("img修改後的屬性爲:" + doc.html());
    }

    /**
     * Reads a whole text file into a string.
     *
     * <p>Every line is terminated with {@code "\n"} in the result, whatever
     * line separator the file uses, and the last line also gets one.
     *
     * @param fileName path of the file to read
     * @param coding   charset name used to decode the file
     * @return the file content with normalized line endings
     * @throws IOException if the file cannot be opened or read, or if
     *                     {@code coding} is not a supported charset name
     */
    public String readFileName(String fileName, String coding)
            throws IOException {
        StringBuilder content = new StringBuilder();
        // Open the stream on its own so it is closed even when the reader
        // cannot be created (for example, an unsupported charset name).
        FileInputStream in = new FileInputStream(fileName);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, coding));
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        } finally {
            // Closing the underlying stream releases the file handle held by the readers.
            in.close();
        }
        return content.toString();
    }
}