一、案例背景
這裏爲了簡化操作,我們以爬取 http://www.fzdm.com/ 網頁的熱門漫畫爲例。
二、對比
SeimiCrawler爬蟲框架 爬取速度較快,但是不穩定(表現在線程一多,易崩潰);selenium自動化測試工具 爬取速度略慢,但是穩定。
三、方式一:SeimiCrawler爬蟲框架
(一)添加依賴
<!-- SeimiCrawler 開源爬蟲框架 -->
<dependency>
<groupId>cn.wanghaomiao</groupId>
<artifactId>SeimiCrawler</artifactId>
<version>2.0</version>
</dependency>
(二)爬取邏輯
特別注意:
push()方法中調用的後續爬取邏輯,比如chapterBean(),其訪問權限一定要是 public,否則什麼都爬取不到!!!!
package org.pc.demo;
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import org.jsoup.nodes.Element;
import org.seimicrawler.xpath.JXDocument;
import org.springframework.util.CollectionUtils;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demo SeimiCrawler crawler that reads the popular-comic links from the
 * fzdm.com home page.
 *
 * <p>Callback methods passed to {@code push()} (for example
 * {@link #chapterBean(Response)}) must be {@code public}; otherwise nothing
 * gets crawled.
 *
 * @author 鹹魚
 * @date 2018/12/26 21:12
 */
@Crawler(name = "my-crawler", httpTimeOut = 30000)
public class MyCrawler extends BaseSeimiCrawler {

    /**
     * @return the seed URLs the crawler starts from
     */
    @Override
    public String[] startUrls() {
        return new String[]{"http://www.fzdm.com/"};
    }

    /**
     * Handles the response for a seed URL: selects the comic links and
     * prepares the name/URL of each comic for the follow-up request.
     *
     * @param response response of the seed page
     */
    @Override
    public void start(Response response) {
        JXDocument document = response.document();
        List<Object> links = document.sel("//div[@id='box1']/li/a");
        if (isEmpty(links)) {
            return;
        }
        for (Object link : links) {
            Element element = (Element) link;
            // NOTE(review): assumes every anchor has at least one child node (its text) - confirm
            String comicName = element.childNode(0).toString();
            // The href is prefixed with the scheme; presumably the page uses protocol-relative links
            String comicUrl = "http:" + element.attr("href");
            Map<String, String> params = new HashMap<>();
            params.put("comicName", comicName);
            // Continue by visiting the comic's chapter information
            // push(Request.build(comicUrl, MyCrawler::chapterBean).setParams(params));
        }
    }

    // Follow-up logic below; this example only uses the code above

    /**
     * Handles a comic page: logs the comic name, then for every chapter link
     * logs its URL and, when the chapter title starts with the comic name
     * followed by a number, the chapter number.
     *
     * @param response response of the comic page; its request is expected to
     *                 carry the {@code comicName} parameter
     */
    public void chapterBean(Response response) {
        String requestUrl = response.getUrl();
        String comicName = response.getRequest().getParams().get("comicName");
        logger.info("漫畫名:" + comicName);
        JXDocument document = response.document();
        List<Object> links = document.sel("//div[@id='content']/li/a");
        if (isEmpty(links)) {
            return;
        }
        // Compile once per page (the pattern does not depend on the loop variable) and
        // quote the comic name so regex metacharacters in it are matched literally.
        Pattern chapterNumberPattern = comicName == null
                ? null
                : Pattern.compile("^" + Pattern.quote(comicName) + "\\s*(\\d+)\\S*");
        for (Object link : links) {
            Element element = (Element) link;
            String chapterName = element.childNode(0).toString();
            String chapterUrl = requestUrl + element.attr("href");
            logger.info("漫畫地址:" + chapterUrl);
            if (chapterNumberPattern != null) {
                Matcher matcher = chapterNumberPattern.matcher(chapterName);
                if (matcher.find()) {
                    // Group 1 is the chapter number captured by the first (...)
                    logger.info(comicName + "第" + matcher.group(1) + "話");
                }
            }
            // push(Request.build(chapterUrl, MyCrawler::contentBean)
            //         // Use SeimiAgent to pre-render JavaScript
            //         .useSeimiAgent()
            //         // Render time
            //         .setSeimiAgentRenderTime(6000)
            // );
        }
    }

    /**
     * Placeholder callback for a chapter's content page; intentionally empty
     * in this example.
     *
     * @param response response of the chapter page
     */
    public void contentBean(Response response) {}

    /**
     * Checks whether an XPath selection produced nothing, logging a hint when
     * it did.
     *
     * @param links nodes returned by the XPath selection
     * @return {@code true} when {@code links} is null or empty
     */
    private boolean isEmpty(List<Object> links) {
        if (CollectionUtils.isEmpty(links)) {
            logger.info("什麼都沒取到,是不是 xpath 寫錯了?");
            return true;
        }
        return false;
    }
}
(三)配置爬蟲
application.properties:
#開啓爬蟲
seimi.crawler.enabled=true
#指定爬蟲
seimi.crawler.names=my-crawler
#配置 seimiagent IP
seimi.crawler.seimi-agent-host=192.168.10.133
#配置 seimiagent 端口
seimi.crawler.seimi-agent-port=8000
四、方式二:selenium自動化測試工具
(一)準備好Chrome瀏覽器和Chrome驅動
(二)添加依賴
<!-- selenium-java客戶端 -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<!-- selenium-chrome驅動 -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-chrome-driver</artifactId>
<version>3.141.59</version>
</dependency>
<!-- selenium-server -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-server</artifactId>
<version>3.141.59</version>
</dependency>
(三)爬取邏輯
package org.pc.demo;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.springframework.util.CollectionUtils;
import java.util.List;
/**
 * Crawls the popular comics from the fzdm.com home page using Selenium with
 * a Chrome browser.
 *
 * @author 鹹魚
 * @date 2018/12/27 19:02
 */
public class MySelenium {

    public static void main(String[] args) {
        crawl();
    }

    /**
     * Opens the home page in Chrome, selects the popular-comic links and
     * prints the text of each one to standard output. The browser session is
     * always closed afterwards, including when no links are found or an
     * exception is thrown.
     */
    private static void crawl() {
        // Tell Selenium where the Chrome driver executable lives
        System.setProperty("webdriver.chrome.driver", "E:\\demo\\crawler\\chromedriver.exe");
        // Start a Chrome session
        WebDriver webDriver = new ChromeDriver();
        try {
            // Load the page to crawl
            webDriver.get("http://www.fzdm.com/");
            // Select the target DOM elements
            List<WebElement> elements = webDriver.findElements(By.xpath("//div[@id='box1']/li/a"));
            if (CollectionUtils.isEmpty(elements)) {
                return;
            }
            for (WebElement element : elements) {
                System.out.println(element.getText());
            }
        } finally {
            // Without quit() the browser and chromedriver processes keep running
            webDriver.quit();
        }
    }
}