需求:爬取項目中指定url頁面,獲取js執行後的html頁面;
調整樣式,將此html頁面(需支持highchart圖表,表格,表單等),通過郵件方式發送給用戶查看.
測試1:基於java嵌套瀏覽器:JBrowserDriver
package test.mail.demo;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import com.machinepublishers.jbrowserdriver.JBrowserDriver;
import com.machinepublishers.jbrowserdriver.Settings;
import com.machinepublishers.jbrowserdriver.Timezone;
/**
 * Approach 1: fetch the HTML of a page after its JavaScript has run, using
 * JBrowserDriver (a browser driver embedded in the Java process).
 *
 * <p>Reasons for adopting this approach (from the author's evaluation):
 * <ol>
 *   <li>It satisfies the requirement.</li>
 *   <li>Little code is needed; the browser is embedded in Java, so the project is easy to move
 *       between environments.</li>
 * </ol>
 */
public class JBrowserDemoTest {

    /**
     * Logs in to the demo site, fetches the role-permission page, prints the HTML and writes it
     * to a file in the working directory.
     *
     * @param args unused
     * @throws IOException if the HTML file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String loginUrl = "https://panjiachen.gitee.io/vue-element-admin/#/login?redirect=%2Fdashboard";
        String url = "https://panjiachen.gitee.io/vue-element-admin/#/permission/role";
        String htmlStr = getUrlHtml(loginUrl, url);
        System.out.println("html" + htmlStr);
        // Write the fetched HTML to a file so it can be opened in a browser and compared
        // with the real page.
        FileUtils.writeStringToFile(new File("url內容.html"), htmlStr, "UTF-8");
    }

    /**
     * Approach 1: obtains the page source via JBrowserDriver.
     *
     * <p>Opens {@code loginUrl}, fills in the username and password fields, clicks the login
     * button, then opens {@code url} and returns the driver's page source.
     *
     * @param loginUrl address of the login page
     * @param url address of the page whose HTML is wanted
     * @return the page source reported by the driver after loading {@code url}
     */
    public static String getUrlHtml(String loginUrl, String url) {
        JBrowserDriver webDriver = new JBrowserDriver(
                Settings.builder().timezone(Timezone.AMERICA_NEWYORK).build());
        // quit() sits in a finally block so the embedded browser is released even when an
        // element lookup or a page load throws.
        try {
            // Simulated login.
            // Option 1 (not used): webDriver.get(null); webDriver.manage().addCookie(cookie);
            // Option 2: fill in and submit the login form.
            webDriver.get(loginUrl);
            WebElement username = webDriver.findElement(By.xpath("//*[@name=\"username\"]"));
            WebElement pwd = webDriver.findElement(By.xpath("//*[@name=\"password\"]"));
            username.clear();
            username.sendKeys("admin");
            pwd.clear();
            pwd.sendKeys("11111111");
            webDriver.findElement(
                    By.xpath("//*[@class=\"el-button el-button--primary el-button--medium\"]")).click();
            // Per the original note, the session now carries the login cookie, so the
            // protected page can be opened.
            webDriver.get(url);
            // HTML after JavaScript execution.
            return webDriver.getPageSource();
        } finally {
            webDriver.quit();
        }
    }

    // References:
    // jBrowserDriver, a WebKit-based browser without a GUI: https://www.oschina.net/p/jbrowserdriver
    // API documentation: http://machinepublishers.github.io/jBrowserDriver/
    // jBrowserDriver is a programmable, embeddable browser driver written in pure Java and
    // based on WebKit: https://java.ctolib.com/jbrowserdriver.html
    /* pom dependency (my project runs on an isolated LAN, so I created a new project on an
       internet-connected machine and copied over all of the dependency jars)
    <dependency>
    <groupId>com.machinepublishers</groupId>
    <artifactId>jbrowserdriver</artifactId>
    <version>1.1.1</version>
    </dependency>
    */
}
測試2:基於谷歌無頭瀏覽器:selenium+chromedriver+chrome headless(谷歌無頭瀏覽器;Windows上安裝谷歌瀏覽器即可)
package test.mail.demo;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.FileUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
/**
 * Approach 2: fetch the HTML of a page after its JavaScript has run, using
 * Selenium + chromedriver + Chrome.
 *
 * <p>Reasons this approach was not adopted (from the author's evaluation):
 * <ol>
 *   <li>The target environment is Linux on an isolated LAN, so headless Chrome cannot be
 *       downloaded online, and a local install has too many dependencies.</li>
 *   <li>The requirement is only to fetch one given URL, not to follow links inside the page,
 *       so this option is too heavyweight and awkward to move between environments.</li>
 * </ol>
 *
 * <p>Files needed on Linux:
 * <ul>
 *   <li>chromedriver_linux64</li>
 *   <li>google-chrome-stable_current_x86_64.rpm
 *       (https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm)</li>
 * </ul>
 */
public class ChromeHeadlessDemoTest {

    /**
     * Logs in to the demo site, fetches the role-permission page, prints the HTML and writes it
     * to a file in the working directory.
     *
     * @param args unused
     * @throws IOException if the HTML file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String loginUrl = "https://panjiachen.gitee.io/vue-element-admin/#/login?redirect=%2Fdashboard";
        String url = "https://panjiachen.gitee.io/vue-element-admin/#/permission/role";
        String htmlStr = getUrlHtml(loginUrl, url);
        System.out.println("html" + htmlStr);
        // Write the fetched HTML to a file so it can be opened in a browser and compared
        // with the real page.
        FileUtils.writeStringToFile(new File("url內容.html"), htmlStr, "UTF-8");
    }

    /**
     * Approach 2: obtains the page source via Selenium + chromedriver + Chrome.
     *
     * <p>Opens {@code loginUrl}, fills in the username and password fields, clicks the login
     * button, opens {@code url}, waits one second and returns the driver's page source.
     *
     * @param loginUrl address of the login page
     * @param url address of the page whose HTML is wanted
     * @return the page source reported by the driver after loading {@code url}
     */
    public static String getUrlHtml(String loginUrl, String url) {
        // Preparation: download the chromedriver matching your Chrome version from
        // https://npm.taobao.org/mirrors/chromedriver/
        // Prerequisite: put chromedriver.exe into the Chrome install directory (next to chrome.exe).
        System.setProperty("webdriver.chrome.driver",
                "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe");
        // NOTE(review): no headless option is passed to ChromeDriver here, so this presumably
        // opens a visible browser window - confirm whether headless mode was intended.
        WebDriver webDriver = new ChromeDriver();
        // quit() sits in a finally block so the chromedriver/Chrome processes are shut down
        // even when an element lookup or a page load throws.
        try {
            // Simulated login.
            // Option 1 (not used): webDriver.get(null); webDriver.manage().addCookie(cookie);
            // Option 2: fill in and submit the login form.
            webDriver.get(loginUrl);
            WebElement username = webDriver.findElement(By.xpath("//*[@name=\"username\"]"));
            WebElement pwd = webDriver.findElement(By.xpath("//*[@name=\"password\"]"));
            username.clear();
            username.sendKeys("admin");
            pwd.clear();
            pwd.sendKeys("11111111");
            webDriver.findElement(
                    By.xpath("//*[@class=\"el-button el-button--primary el-button--medium\"]")).click();
            // Per the original note, the session now carries the login cookie, so the
            // protected page can be opened.
            webDriver.get(url);
            // Wait 1s before reading the page source.
            try {
                TimeUnit.SECONDS.sleep(1);
            } catch (InterruptedException e) {
                // Restore the interrupt flag instead of swallowing it; the page source is
                // still read below, as before.
                Thread.currentThread().interrupt();
            }
            // HTML after JavaScript execution.
            return webDriver.getPageSource();
        } finally {
            webDriver.quit();
        }
    }

    // References:
    // Crawling with selenium + chromedriver in Java: https://www.jianshu.com/p/30b60f5da23c
    // Simulating a login with selenium + chromedriver: https://www.jianshu.com/p/308daa2b91c2
    /*
    * pom dependency
    <dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>3.141.59</version>
    </dependency>
    */
    // Linux notes: too many dependencies; the offline install was eventually abandoned.
    // Installing Chrome on a LAN-only Linux host fails on the missing dependencies
    // libappindicator3.so.1 and liberation-fonts: https://blog.csdn.net/wangying202/article/details/102565367
    // Linux rpm package downloads: http://rpmfind.net/
    // http://rpmfind.net/linux/centos/7.7.1908/os/x86_64/Packages/liberation-fonts-1.07.2-16.el7.noarch.rpm
}
測試3:基於java嵌套瀏覽器:htmlunit
package test.mail.demo;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Approach 3: fetch the HTML of a page after its JavaScript has run, using HtmlUnit.
 *
 * <p>Reasons this approach was not adopted (from the author's evaluation):
 * <ol>
 *   <li>Fetching the Highcharts chart failed (width and height in the svg came out as NaN).</li>
 *   <li>When fetching forms from the author's Vue front-end pages, the generated
 *       {@code margin-left} style was wrong.</li>
 * </ol>
 */
public class HtmlUnitDemoTest {

    /**
     * Fetches the role-permission page of the demo site, prints the HTML and writes it to a
     * file in the working directory.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched or the HTML file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String loginUrl = "https://panjiachen.gitee.io/vue-element-admin/#/login?redirect=%2Fdashboard";
        String url = "https://panjiachen.gitee.io/vue-element-admin/#/permission/role";
        String htmlStr = getUrlHtml(loginUrl, url);
        System.out.println("html" + htmlStr);
        // Write the fetched HTML to a file so it can be opened in a browser and compared
        // with the real page.
        FileUtils.writeStringToFile(new File("url內容.html"), htmlStr, "UTF-8");
    }

    /**
     * Approach 3: obtains the page as XML via HtmlUnit.
     *
     * <p>No login step is implemented here, so {@code loginUrl} is accepted but not used.
     *
     * @param loginUrl address of the login page (currently unused)
     * @param url address of the page whose HTML is wanted
     * @return the XML serialisation of the page loaded from {@code url}
     * @throws IOException if loading the page fails
     */
    public static String getUrlHtml(String loginUrl, String url) throws IOException {
        // try-with-resources closes the client even when getPage throws.
        try (WebClient client = new WebClient(BrowserVersion.CHROME)) { // browser engine to emulate
            client.getCookieManager().setCookiesEnabled(true); // enable cookies
            client.getOptions().setJavaScriptEnabled(true); // enable JavaScript
            // CSS: must be true for Vue projects; jQuery projects usually set it to false
            // because it slows things down.
            client.getOptions().setCssEnabled(true);
            client.getOptions().setThrowExceptionOnScriptError(false);
            client.getOptions().setThrowExceptionOnFailingStatusCode(false);
            client.setAjaxController(new NicelyResynchronizingAjaxController()); // ajax handling
            // Login is not implemented.
            HtmlPage page = client.getPage(url);
            return page.asXml();
        }
    }

    // References:
    // Using HtmlUnit to simulate a Sina login and obtain cookies:
    // https://blog.csdn.net/m0_37300802/article/details/79046034
    //
    /*
    * pom dependency
    <dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.38.0</version>
    </dependency>
    */
}
測試4:PhantomJS
package test.mail.demo;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
/**
 * Approach 4: fetch the HTML of a page after its JavaScript has run, by running PhantomJS
 * as an external process.
 *
 * <p>Reasons this approach was not adopted (from the author's evaluation):
 * <ol>
 *   <li>Fetching the Highcharts chart failed (width and height in the svg came out as NaN),
 *       and fetching the author's Vue front-end pages failed often.</li>
 *   <li>The target environment is Linux on an isolated LAN, so the Linux build of PhantomJS
 *       cannot be downloaded online, and a local install has too many dependencies.</li>
 *   <li>The requirement is only to fetch one given URL, not to follow links inside the page,
 *       so this option is too heavyweight and awkward to move between environments.</li>
 * </ol>
 *
 * <p>Required binary: phantomjs.exe
 */
public class PhantomJSDemoTest {

    /**
     * Runs the login script and the fetch script against the demo site, prints the HTML and
     * writes it to a file in the working directory.
     *
     * @param args unused
     * @throws IOException if a PhantomJS process fails to start or the HTML file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String loginUrl = "https://panjiachen.gitee.io/vue-element-admin/#/login?redirect=%2Fdashboard";
        String url = "https://panjiachen.gitee.io/vue-element-admin/#/permission/role";
        String htmlStr = getUrlHtml(loginUrl, url);
        System.out.println("html" + htmlStr);
        // Write the fetched HTML to a file so it can be opened in a browser and compared
        // with the real page.
        FileUtils.writeStringToFile(new File("url內容.html"), htmlStr, "UTF-8");
    }

    /**
     * Approach 4: obtains the page HTML via PhantomJS.
     *
     * <p>Runs {@code writeCookie.js} with {@code loginUrl} and waits for it to exit, then runs
     * {@code getHtml.js} with {@code url} and returns everything that script writes to its
     * standard output. Neither script is part of this file; per the original notes, the first
     * logs in and writes the cookie to a file (current directory by default) and the second
     * reads that cookie and prints the page HTML via console.log.
     *
     * @param loginUrl address of the login page, passed to {@code writeCookie.js}
     * @param url address of the page whose HTML is wanted, passed to {@code getHtml.js}
     * @return the standard output of the {@code getHtml.js} run, decoded as UTF-8
     * @throws IOException if a process cannot be started, its output cannot be read, or the
     *     calling thread is interrupted while waiting for the login script
     */
    public static String getUrlHtml(String loginUrl, String url) throws IOException {
        // Preparation: download phantomjs.exe from https://npm.taobao.org/mirrors/phantomJS/
        String exePath = "E:\\phantomjs-windows\\bin\\phantomjs.exe";
        String cookieJsPath = "E:\\phantomjs-windows\\bin\\writeCookie.js";
        String htmlJsPath = "E:\\phantomjs-windows\\bin\\getHtml.js";

        // Step 1: log in. Arguments are passed individually so paths or URLs containing
        // whitespace are not split. The script's output goes to this JVM's console
        // (inheritIO) so the child cannot block on a full pipe that nobody reads.
        Process loginProcess = new ProcessBuilder(exePath, cookieJsPath, loginUrl)
                .inheritIO()
                .start();
        try {
            // Wait for the login script to finish; destroying the process right after
            // starting it would kill it before it could do its work.
            loginProcess.waitFor();
        } catch (InterruptedException e) {
            loginProcess.destroy();
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while waiting for the PhantomJS login script", e);
        }

        // Step 2: fetch the protected page. stderr is inherited rather than piped so that
        // diagnostics are neither mixed into the HTML nor left undrained.
        Process htmlProcess = new ProcessBuilder(exePath, htmlJsPath, url)
                .redirectError(ProcessBuilder.Redirect.INHERIT)
                .start();
        // The process's stdout carries whatever the script prints with console.log.
        try (InputStream is = htmlProcess.getInputStream()) {
            return IOUtils.toString(is, "utf-8");
        } finally {
            // Ensure no PhantomJS process is left behind once its output has been consumed.
            htmlProcess.destroy();
        }
    }

    // References:
    // PhantomJS usage guide: https://www.cnblogs.com/Sonet-life/p/5393730.html
    // Fetching dynamic pages with PhantomJS from Java: https://blog.csdn.net/kaka0930/article/details/68941932
    // Server-side Highcharts chart generation in Java: https://blog.csdn.net/MAOZEXIJR/article/details/84886104
}