爬取URL解析JS後頁面,簡單例子:ChromeDriver,htmlunit,jbrowserdriver,phantomjs

需求:爬取項目中指定URL的頁面,獲取JS執行後的HTML頁面;

       調整樣式後,將此HTML頁面(需支持Highcharts圖表、表格、表單等)通過郵件方式發送給用戶查看。

 

測試1:基於Java嵌入式瀏覽器:JBrowserDriver

package test.mail.demo;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;

import com.machinepublishers.jbrowserdriver.JBrowserDriver;
import com.machinepublishers.jbrowserdriver.Settings;
import com.machinepublishers.jbrowserdriver.Timezone;

/**
 * Approach 1: fetch a page with jBrowserDriver and capture the HTML produced
 * after the page's JavaScript has run.
 *
 * <p>Reasons this approach was adopted (author's notes):
 * <ul>
 *   <li>it satisfies the requirement;</li>
 *   <li>it needs little code, and because the browser is embedded in Java the
 *       project is easy to move between environments.</li>
 * </ul>
 */
public class JBrowserDemoTest {

	/**
	 * Logs in, fetches the protected page, prints its HTML and writes it to a
	 * file in the current working directory so it can be opened in a browser
	 * and compared with the live page.
	 *
	 * @param args unused
	 * @throws IOException if the HTML file cannot be written
	 */
	public static void main(String[] args) throws IOException {
		String loginUrl="https://panjiachen.gitee.io/vue-element-admin/#/login?redirect=%2Fdashboard";
		String url="https://panjiachen.gitee.io/vue-element-admin/#/permission/role";
		String htmlStr = getUrlHtml(loginUrl,url);
		System.out.println("html"+htmlStr);
		// Write the crawled HTML to a file under the current project for visual comparison.
		FileUtils.writeStringToFile(new File("url內容.html"), htmlStr,"UTF-8");
	}

	/**
	 * Logs in through the form at {@code loginUrl}, then loads {@code url} in the
	 * same driver session and returns its page source.
	 *
	 * <p>Alternative login strategy noted by the author: load a page first and
	 * inject a cookie via {@code webDriver.manage().addCookie(cookie)}.
	 *
	 * @param loginUrl address of the login page containing the username/password form
	 * @param url      address of the page whose rendered HTML is wanted
	 * @return the page source reported by the driver after loading {@code url}
	 */
	public static String getUrlHtml(String loginUrl,String url) {
		JBrowserDriver webDriver = new JBrowserDriver(
				Settings.builder().timezone(Timezone.AMERICA_NEWYORK).build());
		// try/finally guarantees the driver is shut down even when a lookup or
		// navigation step throws; otherwise the browser process would be leaked.
		try {
			// Simulate a login by filling in and submitting the form.
			webDriver.get(loginUrl);
			WebElement username = webDriver.findElement(By.xpath("//*[@name=\"username\"]"));
			WebElement pwd = webDriver.findElement(By.xpath("//*[@name=\"password\"]"));
			username.clear();
			username.sendKeys("admin");
			pwd.clear();
			pwd.sendKeys("11111111");
			webDriver.findElement(By.xpath("//*[@class=\"el-button el-button--primary el-button--medium\"]")).click();
			// The session is expected to carry the login cookie now, so the
			// authorised page can be requested.
			webDriver.get(url);
			// HTML after JavaScript execution.
			return webDriver.getPageSource();
		} finally {
			webDriver.quit();
		}
	}

	// References:
	//	jBrowserDriver, a WebKit-based browser without a GUI: https://www.oschina.net/p/jbrowserdriver
	//	API documentation: http://machinepublishers.github.io/jBrowserDriver/
	//	jBrowserDriver, a programmable, embeddable, pure-Java browser driver based on WebKit: https://java.ctolib.com/jbrowserdriver.html

	/* pom dependency (the author's project runs on an isolated LAN; the jars were
	   copied from a throw-away project created on an internet-connected machine)
	 <dependency>
			<groupId>com.machinepublishers</groupId>
			<artifactId>jbrowserdriver</artifactId>
			<version>1.1.1</version>
		</dependency>

	 */
}

 

測試2:基於谷歌無頭瀏覽器:selenium+chromedriver+chrome headless(谷歌無頭瀏覽器,Windows上安裝谷歌瀏覽器即可)

package test.mail.demo;


import java.io.File;
import java.io.IOException;
import java.util.concurrent.TimeUnit;

import org.apache.commons.io.FileUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;

/**
 * Approach 2: fetch a page with Selenium + chromedriver + Chrome and capture
 * the HTML produced after the page's JavaScript has run.
 *
 * <p>Reasons this approach was rejected (author's notes):
 * <ul>
 *   <li>the target environment is Linux on an isolated LAN, so headless Chrome
 *       cannot be downloaded there, and it pulls in too many local dependencies;</li>
 *   <li>the requirement is only to fetch one given URL, not to follow links, so
 *       this option is too heavy and awkward to move between environments.</li>
 * </ul>
 *
 * <p>Linux artefacts the author looked at:
 * chromedriver_linux64, google-chrome-stable_current_x86_64.rpm
 * (https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm).
 */
public class ChromeHeadlessDemoTest {

	/**
	 * Logs in, fetches the protected page, prints its HTML and writes it to a
	 * file in the current working directory so it can be opened in a browser
	 * and compared with the live page.
	 *
	 * @param args unused
	 * @throws IOException if the HTML file cannot be written
	 */
	public static void main(String[] args) throws IOException {
		String loginUrl="https://panjiachen.gitee.io/vue-element-admin/#/login?redirect=%2Fdashboard";
		String url="https://panjiachen.gitee.io/vue-element-admin/#/permission/role";
		String htmlStr = getUrlHtml(loginUrl,url);
		System.out.println("html"+htmlStr);
		// Write the crawled HTML to a file under the current project for visual comparison.
		FileUtils.writeStringToFile(new File("url內容.html"), htmlStr,"UTF-8");
	}

	/**
	 * Logs in through the form at {@code loginUrl}, then loads {@code url} in the
	 * same driver session, waits one second and returns its page source.
	 *
	 * <p>Preparation: download the chromedriver matching the installed Chrome
	 * version (https://npm.taobao.org/mirrors/chromedriver/) and place
	 * chromedriver.exe in the Chrome installation directory, next to chrome.exe.
	 *
	 * <p>NOTE(review): the driver is created without any options, so no headless
	 * flag is passed here despite the approach's name - confirm whether a
	 * headless configuration was intended.
	 *
	 * <p>Alternative login strategy noted by the author: load a page first and
	 * inject a cookie via {@code webDriver.manage().addCookie(cookie)}.
	 *
	 * @param loginUrl address of the login page containing the username/password form
	 * @param url      address of the page whose rendered HTML is wanted
	 * @return the page source reported by the driver after loading {@code url}
	 */
	public static String getUrlHtml(String loginUrl,String url) {
		System.setProperty("webdriver.chrome.driver","C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe");
		WebDriver webDriver = new ChromeDriver();
		// try/finally guarantees the browser is shut down even when a lookup or
		// navigation step throws; otherwise the Chrome process would be leaked.
		try {
			// Simulate a login by filling in and submitting the form.
			webDriver.get(loginUrl);
			WebElement username = webDriver.findElement(By.xpath("//*[@name=\"username\"]"));
			WebElement pwd = webDriver.findElement(By.xpath("//*[@name=\"password\"]"));
			username.clear();
			username.sendKeys("admin");
			pwd.clear();
			pwd.sendKeys("11111111");
			webDriver.findElement(By.xpath("//*[@class=\"el-button el-button--primary el-button--medium\"]")).click();
			// The session is expected to carry the login cookie now, so the
			// authorised page can be requested.
			webDriver.get(url);
			// Wait 1s before reading the page source.
			try {
				TimeUnit.SECONDS.sleep(1);
			} catch (InterruptedException e) {
				// Restore the interrupt flag so callers can still observe the
				// interruption; the page source is read without further waiting.
				Thread.currentThread().interrupt();
			}
			// HTML after JavaScript execution.
			return webDriver.getPageSource();
		} finally {
			webDriver.quit();
		}
	}

	// References:
	//	Crawling with selenium+chromedriver in Java: https://www.jianshu.com/p/30b60f5da23c
	//	Simulating a login with selenium+Chromedriver: https://www.jianshu.com/p/308daa2b91c2
	/*
	 * pom dependency
		 <dependency>
		    <groupId>org.seleniumhq.selenium</groupId>
		    <artifactId>selenium-java</artifactId>
		    <version>3.141.59</version>
		</dependency>
	 */
	// Linux notes: too many dependencies, the offline installation was eventually abandoned.
	//	Installing Chrome on a LAN-only Linux host fails on missing libappindicator3.so.1 and liberation-fonts: https://blog.csdn.net/wangying202/article/details/102565367
	//	Linux rpm download site: http://rpmfind.net/
	//	http://rpmfind.net/linux/centos/7.7.1908/os/x86_64/Packages/liberation-fonts-1.07.2-16.el7.noarch.rpm
}

測試3:基於Java嵌入式瀏覽器:htmlunit

package test.mail.demo;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Approach 3: fetch a page with HtmlUnit and capture the HTML produced after
 * the page's JavaScript has run.
 *
 * <p>Reasons this approach was rejected (author's notes):
 * <ul>
 *   <li>crawling Highcharts charts failed (width/height in the svg came out as NaN);</li>
 *   <li>when crawling forms of the author's Vue front end, the generated
 *       margin-left style was wrong.</li>
 * </ul>
 */
public class HtmlUnitDemoTest {

	/**
	 * Fetches the target page, prints its HTML and writes it to a file in the
	 * current working directory so it can be opened in a browser and compared
	 * with the live page.
	 *
	 * @param args unused
	 * @throws IOException if the page cannot be fetched or the file cannot be written
	 */
	public static void main(String[] args) throws IOException {
		String loginUrl="https://panjiachen.gitee.io/vue-element-admin/#/login?redirect=%2Fdashboard";
		String url="https://panjiachen.gitee.io/vue-element-admin/#/permission/role";
		String htmlStr = getUrlHtml(loginUrl,url);
		System.out.println("html"+htmlStr);
		// Write the crawled HTML to a file under the current project for visual comparison.
		FileUtils.writeStringToFile(new File("url內容.html"), htmlStr,"UTF-8");
	}

	/**
	 * Loads {@code url} with an HtmlUnit client and returns the page serialised as XML.
	 *
	 * <p>No login step is implemented in this variant, so {@code loginUrl} is
	 * accepted only for signature parity with the other demos and is not used.
	 *
	 * @param loginUrl address of the login page (currently unused)
	 * @param url      address of the page whose rendered HTML is wanted
	 * @return the result of {@code HtmlPage.asXml()} for the loaded page
	 * @throws IOException if loading the page fails
	 */
	public static String getUrlHtml(String loginUrl,String url) throws IOException {
		// try-with-resources closes the client even when getPage/asXml throws.
		try (WebClient client = new WebClient(BrowserVersion.CHROME)) { // browser profile to emulate
			client.getCookieManager().setCookiesEnabled(true); // enable cookies
			client.getOptions().setJavaScriptEnabled(true); // enable JavaScript
			// CSS: the author notes Vue projects need this set to true, while jQuery
			// projects usually set it to false because it slows things down.
			client.getOptions().setCssEnabled(true);
			client.getOptions().setThrowExceptionOnScriptError(false);
			client.getOptions().setThrowExceptionOnFailingStatusCode(false);
			client.setAjaxController(new NicelyResynchronizingAjaxController()); // ajax handling
			// Login is not implemented here.
			HtmlPage page = client.getPage(url);
			return page.asXml();
		}
	}

	// References:
	//	Simulating a Sina login with htmlunit to obtain cookies: https://blog.csdn.net/m0_37300802/article/details/79046034
	//
	/*
	 * pom dependency
	 <dependency>
			<groupId>net.sourceforge.htmlunit</groupId>
			<artifactId>htmlunit</artifactId>
			<version>2.38.0</version>
		</dependency>
	 */
}

測試4:PhantomJS

package test.mail.demo;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;


/**
 * Approach 4: fetch a page by running PhantomJS scripts as external processes
 * and capture the HTML they print.
 *
 * <p>Reasons this approach was rejected (author's notes):
 * <ul>
 *   <li>crawling Highcharts charts failed (width/height in the svg came out as
 *       NaN), and crawling the author's Vue front end had a high failure rate;</li>
 *   <li>the target environment is Linux on an isolated LAN, so the Linux build
 *       of PhantomJS cannot be downloaded there, and it pulls in too many local
 *       dependencies;</li>
 *   <li>the requirement is only to fetch one given URL, not to follow links, so
 *       this option is too heavy and awkward to move between environments.</li>
 * </ul>
 *
 * <p>Required binary: phantomjs.exe.
 */
public class PhantomJSDemoTest {

	/**
	 * Logs in, fetches the protected page, prints its HTML and writes it to a
	 * file in the current working directory so it can be opened in a browser
	 * and compared with the live page.
	 *
	 * @param args unused
	 * @throws IOException if a PhantomJS process fails to run or the file cannot be written
	 */
	public static void main(String[] args) throws IOException {
		String loginUrl="https://panjiachen.gitee.io/vue-element-admin/#/login?redirect=%2Fdashboard";
		String url="https://panjiachen.gitee.io/vue-element-admin/#/permission/role";
		String htmlStr = getUrlHtml(loginUrl,url);
		System.out.println("html"+htmlStr);
		// Write the crawled HTML to a file under the current project for visual comparison.
		FileUtils.writeStringToFile(new File("url內容.html"), htmlStr,"UTF-8");
	}

	/**
	 * Runs the login script against {@code loginUrl}, waits for it to finish,
	 * then runs the HTML-dumping script against {@code url} and returns what
	 * that script wrote to standard output.
	 *
	 * <p>Preparation: download phantomjs.exe from
	 * https://npm.taobao.org/mirrors/phantomJS/.
	 *
	 * <p>Per the author's notes, the login script writes the cookie to a file
	 * (in the current directory by default) and the second script reads it, so
	 * the first process must be allowed to complete before the second starts.
	 * The scripts themselves are not part of this file.
	 *
	 * @param loginUrl address passed to the cookie-writing script
	 * @param url      address passed to the HTML-dumping script
	 * @return standard output of the HTML-dumping script (its console.log content), decoded as UTF-8
	 * @throws IOException if a process cannot be started, its output cannot be
	 *                     read, or the calling thread is interrupted while waiting
	 */
	public static String getUrlHtml(String loginUrl,String url) throws IOException {
		String exePath="E:\\phantomjs-windows\\bin\\phantomjs.exe";
		// Step 1: log in; the script is expected to persist the cookie to a file.
		String cookieJsPath="E:\\phantomjs-windows\\bin\\writeCookie.js";
		runAndCapture(exePath, cookieJsPath, loginUrl);
		// Step 2: the script is expected to read that cookie and print the authorised page.
		String htmlJsPath="E:\\phantomjs-windows\\bin\\getHtml.js";
		return runAndCapture(exePath, htmlJsPath, url);
	}

	/**
	 * Starts {@code command} as a child process, drains its standard output,
	 * waits for it to exit and returns the captured output.
	 *
	 * <p>Arguments are passed as a list rather than one concatenated string, so
	 * a URL or path containing whitespace is not split into several arguments.
	 * Standard error is inherited from this JVM so the child cannot block on a
	 * full stderr pipe.
	 */
	private static String runAndCapture(String... command) throws IOException {
		Process process = new ProcessBuilder(command)
				.redirectError(ProcessBuilder.Redirect.INHERIT)
				.start();
		try (InputStream is = process.getInputStream()) {
			// Drain stdout before waiting: a child blocked on a full pipe would never exit.
			String output = IOUtils.toString(is, "utf-8");
			process.waitFor();
			return output;
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			throw new IOException("Interrupted while waiting for " + command[0], e);
		} finally {
			// No-op for a process that already exited; cleans up on failure paths.
			process.destroy();
		}
	}

	// References:
	//	PhantomJS usage notes: https://www.cnblogs.com/Sonet-life/p/5393730.html
	//	Crawling dynamic pages with PhantomJS from Java: https://blog.csdn.net/kaka0930/article/details/68941932
	//	Generating Highcharts charts purely on the back end in Java: https://blog.csdn.net/MAOZEXIJR/article/details/84886104
}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章