1.工具準備
2.瞭解並閱讀相關文檔
這裏不過多介紹,想要了解的可以自己去官網翻看,我們用httpclient模擬瀏覽器請求,用jsoup解析httpclient返回的實體的文檔對象。
3.開始編寫代碼
package com.ff.jsoup;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Map;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class JsoupHelloWorld {

    /**
     * Fetches the page at {@code url} and returns the proxy-list table rows
     * selected with the CSS selector {@code #ip_list > tbody > tr}.
     *
     * @param url the page to fetch
     * @return the matched {@code <tr>} elements (empty if the selector matches nothing)
     * @throws Exception if the HTTP request fails or times out
     */
    public Elements getRequestMethod(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        // setConnectTimeout(10000): connection timeout in milliseconds
        // setSocketTimeout(10000):  read timeout in milliseconds
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(10000)
                .setSocketTimeout(10000)
                .build();
        httpGet.setConfig(config);
        // Pretend to be a desktop browser; some sites reject the default client UA.
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64)"
                + " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
        String html = "";
        // try-with-resources closes BOTH the client and the response, even when an
        // exception is thrown (the original never closed the client at all, and
        // leaked the response on any failure before response.close()).
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            HttpEntity entity = response.getEntity(); // response body, may be null
            if (entity != null) {
                // Decode explicitly as UTF-8 instead of falling back to the
                // entity's (possibly absent) charset header.
                html = EntityUtils.toString(entity, "UTF-8");
            }
        }
        Document doc = Jsoup.parse(html); // parse the page into a DOM document
        /*
         * Other jsoup lookups, for reference:
         *   getElementById(String id)                           - query DOM by id
         *   getElementsByTag(String tagName)                    - query DOM by tag name
         *   getElementsByClass(String className)                - query DOM by CSS class
         *   getElementsByAttribute(String key)                  - query DOM by attribute name
         *   getElementsByAttributeValue(String key, String val) - by attribute name + value
         */
        Elements tags = doc.select("#ip_list > tbody > tr");
        return tags;
    }

    /**
     * Extracts the IP address (2nd cell) and port (3rd cell) from each table row
     * and stores them joined by "." under 1-based insertion keys.
     *
     * NOTE(review): the separator "." is kept from the original; a proxy address
     * is conventionally written "ip:port" — confirm before changing consumers.
     *
     * @param tags the {@code <tr>} rows returned by {@link #getRequestMethod(String)}
     * @return map of 1-based row index to "ip.port" string (key 1 is the header row)
     */
    public Map<Integer, String> getElementsByTags(Elements tags) {
        Map<Integer, String> ipAddress = new HashMap<Integer, String>();
        int count = 0;
        // Walk every child row of the tbody.
        for (Element element : tags) {
            // IP address cell (2nd <td> of the row).
            Elements tdChilds = element.select("tr > td:nth-child(2)");
            // Port number cell (3rd <td> of the row).
            Elements tcpd = element.select("tr > td:nth-child(3)");
            ipAddress.put(++count, tdChilds.text() + "." + tcpd.text());
        }
        System.out.println(ipAddress);
        return ipAddress;
    }

    /**
     * Writes the collected addresses to D:\cc\IpAddress.txt, one per line, UTF-8.
     *
     * @param map addresses keyed 1..size as produced by {@link #getElementsByTags}
     */
    public void saveToText(Map<Integer, String> map) {
        // try-with-resources guarantees the writer is closed even if a write fails
        // (the original leaked it on exception).
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(new File("D:\\cc\\IpAddress.txt")), "UTF-8"))) {
            // Keys run 1..map.size(); start at 2 to skip the table's header row,
            // and use <= so the last address is written (the original's
            // "i < map.size()" silently dropped the final entry).
            for (int i = 2; i <= map.size(); i++) {
                String ipAddess = map.get(i);
                bw.write(ipAddess);
                bw.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("ip地址爬取保存完畢");
    }

    /** Entry point: fetch the list page, extract addresses, save them to disk. */
    public static void main(String[] args) {
        JsoupHelloWorld ht = new JsoupHelloWorld();
        String url = "http://www.xicidaili.com/nn/";
        try {
            Elements elements = ht.getRequestMethod(url);
            Map<Integer, String> elementsByTags = ht.getElementsByTags(elements);
            ht.saveToText(elementsByTags);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
4.流程分析
1.首先通過httpclient請求頁面返回實體
2.然後通過jsoup解析得到Document對象
3.分析西刺網站的ip地址和端口號所在位置
4.用select選擇器提取要爬取的ip地址和端口號
5.將ip地址和端口號拼接成一個完整的地址,再用Map集合封裝地址
6.遍歷Map,通過IO流的輸出流將Map封裝的地址輸出到本地文件,每寫入一行換行
7.關閉流,控制檯輸出結束語