關注公衆號【肥超說技術】,回覆【資源】,可獲取200G+最新微服務、docker技術資源等。
1.webmagic基本使用。
詳情不再贅述,具體請看開發者給出的開發文檔。我在此處使用的是 webmagic-selenium,因爲我們爬取的頁面往往是動態的,有時甚至伴隨着點擊事件;若是靜態頁面,則不需要使用它。
2. webmagic配置。
- maven依賴。
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-selenium</artifactId>
<version>0.7.3</version>
</dependency>
2.去gitee上下載webmagic源碼進行修改部分代碼(若用不到webmagic-selenium,不用修改)。
修改模塊文件webmagic-selenium下的WebDriverPool.java文件,修改完成打包源文件,重新刷新依賴。
// Excerpt of the edits made to WebDriverPool.java in the webmagic-selenium module.
// 1. Change the location of the configuration file.
private static final String DEFAULT_CONFIG_FILE = "selenium.properties";
// 2. Change how the browser is launched (the Java code opens a browser and drives it;
//    if the visible browser window is annoying, it can be turned off here).
// A driver value that is a URL selects a remote WebDriver session.
if (isUrl(driver)) {
sCaps.setBrowserName("phantomjs");
mDriver = new RemoteWebDriver(new URL(driver), sCaps);
} else if (driver.equals(DRIVER_FIREFOX)) {
mDriver = new FirefoxDriver(sCaps);
} else if (driver.equals(DRIVER_CHROME)) {
// Chrome is started with headless options so no browser window is shown.
ChromeOptions options = new ChromeOptions();
options.setHeadless(true);
// NOTE(review): this argument looks redundant with setHeadless(true) above — confirm.
options.addArguments("-headless");
mDriver = new ChromeDriver(options);
} else if (driver.equals(DRIVER_PHANTOMJS)) {
mDriver = new PhantomJSDriver(sCaps);
}
3.webmagic的使用
去下載自己瀏覽器對應的版本驅動
1.使用webmagic進行爬取噹噹網的商品分類,不需要使用selenium。
import com.alibaba.fastjson.JSONObject;
import com.magic.demo.ConsolePipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
/**
 * Page processor that reads the Dangdang category page and prints its
 * three-level category tree (first-, second- and third-level titles) to
 * standard output. The page is static, so no Selenium downloader is needed.
 */
public class DangDangProcessor implements PageProcessor {

    /** Browser-like User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36";

    /** Crawl settings shared by every request issued for this processor. */
    private final Site site = Site
            .me()
            // Pause between two page fetches, in milliseconds (this is NOT a request timeout).
            .setSleepTime(3000)
            .setUserAgent(USER_AGENT);

    /**
     * Extracts the category hierarchy from the downloaded page and prints it.
     *
     * @param page the downloaded page whose HTML is scanned for category blocks
     */
    @Override
    public void process(Page page) {
        // Each "classify_books" div is one first-level category block.
        Selectable categoryBlocks = page.getHtml().xpath("//div[@class=\"classify_books\"]");
        List<Selectable> nodes = categoryBlocks.nodes();
        for (Selectable categoryNode : nodes) {
            // First-level title
            String oneTitle = categoryNode.$("h3[class^=\"classify_title\"] > a", "text").toString();
            System.out.println("======" + oneTitle);

            Selectable kindBlocks = categoryNode.xpath("//div[@class=\"classify_kind\"]");
            for (Selectable kindNode : kindBlocks.nodes()) {
                // Second-level title
                String twoTitle = kindNode.xpath("/div/div/a/text()").toString();
                System.out.println("============" + twoTitle);
                // Third-level titles, printed as a JSON array
                List<String> threeTitle = kindNode.xpath("//li[@name=\"cat_3\"]/a/text()").all();
                System.out.println("====================================" + JSONObject.toJSONString(threeTitle));
            }
        }
    }

    /**
     * @return the crawl settings used for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a spider on the Dangdang category page and blocks until it finishes.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new DangDangProcessor())
                // Pipeline usage guide: http://webmagic.io/docs/zh/posts/ch6-custom-componenet/pipeline.html
                .addUrl("http://category.dangdang.com/?ref=www-0-C")
                .addPipeline(new ConsolePipeline())
                .run();
    }
}
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.List;
import java.util.Map;
/**
 * Minimal pipeline that reports, on standard output, the URL of every
 * request whose results reach it.
 */
public class ConsolePipeline implements Pipeline {

    /**
     * Prints the URL of the request that produced {@code resultItems}.
     *
     * @param resultItems the extracted results, used here only for their originating request
     * @param task        the running crawl task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        // Pipeline usage guide: http://webmagic.io/docs/zh/posts/ch6-custom-componenet/pipeline.html
        final String visitedUrl = resultItems.getRequest().getUrl();
        System.out.println("url:" + visitedUrl);
    }
}
效果如下
2. 當我們需要爬取一些登錄後才能訪問的網站,或者需要在動態網頁中單擊某個菜單後才能獲取所需的頁面信息時,可以使用webmagic-selenium。
import com.alibaba.fastjson.JSONObject;
import com.magic.demo.ConsolePipeline;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.Properties;
import java.util.Set;
/**
 * Page processor for a site that requires login. It first drives a real
 * Chrome browser through the login form to collect the session cookies,
 * then attaches those cookies to the {@link Site} used by the spider.
 */
public class JcoolLoginProcessor implements PageProcessor {

    /** Local path of the chromedriver executable; used for both the login and the downloader. */
    private static final String CHROME_DRIVER_PATH =
            "C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\75.0.3770.100\\chromedriver_win32\\chromedriver.exe";

    /** Browser-like User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36";

    /** Cookies captured by {@link #getCookie()}; {@code null} until that method has run. */
    private Set<Cookie> cookies;

    /** Crawl settings shared by every request issued for this processor. */
    private final Site site = Site
            .me()
            .setSleepTime(3000)
            // .setCycleRetryTimes(5) would retry failed requests
            .setUserAgent(USER_AGENT);

    /**
     * Intentionally empty in this example: page extraction logic goes here.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
    }

    /**
     * Logs in through a Chrome browser and stores the resulting cookies in
     * {@link #cookies}. The browser session is always shut down afterwards,
     * even when an element lookup or the navigation fails.
     */
    void getCookie() {
        // Tell Selenium where the chromedriver executable lives.
        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);
        WebDriver driver = new ChromeDriver();
        try {
            driver.manage().window().maximize();
            // Open the login page.
            driver.get("登錄url");
            // Fill in the user name and password fields.
            driver.findElement(By.xpath("//input[@name='loginName']")).sendKeys("admin");
            driver.findElement(By.xpath("//input[@name='loginPass']")).sendKeys("123456");
            // Locate the login button and click it.
            WebElement loginButton =
                    driver.findElement(By.xpath("//button[@class='el-button loadBtn el-button--primary']"));
            loginButton.click();
            // NOTE(review): cookies are read immediately after the click with no wait;
            // if the login completes asynchronously they may not be set yet — confirm.
            cookies = driver.manage().getCookies();
        } finally {
            // quit() ends the whole session (browser and driver process); the previous
            // close() ran only on the success path and only closed the current window.
            driver.quit();
        }
    }

    /**
     * Returns the crawl settings, with any captured login cookies attached.
     *
     * @return the site configuration; cookies are added only if {@link #getCookie()} has run
     */
    @Override
    public Site getSite() {
        // Author's note: the domain must be set first (site.setDomain()) for addCookie
        // to take effect — TODO confirm and set the domain of the target site.
        if (cookies != null) {
            for (Cookie cookie : cookies) {
                site.addCookie(cookie.getName(), cookie.getValue());
            }
        }
        return site;
    }

    /**
     * Logs in, then crawls the protected URL with the Selenium downloader.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        JcoolLoginProcessor jcoolLoginProcessor = new JcoolLoginProcessor();
        jcoolLoginProcessor.getCookie();
        Spider.create(jcoolLoginProcessor)
                .addUrl("要訪問的需要登錄url")
                .addPipeline(new ConsolePipeline())
                .setDownloader(new SeleniumDownloader(CHROME_DRIVER_PATH))
                .run();
    }
}