**使用httpclient獲取其他網站數據**
使用httpclient模擬瀏覽器請求網站加載個人訴訟記錄信息接口;
總結:1.系統上線後若運行在linux系統上,則通過python命令來識別驗證碼:先將驗證碼圖片保存到本地,識別完成後再刪除;這需要一個python腳本,代碼貼在下文:
2.如果是在windows系統上運行該系統,提供了一個OCR的封裝類,直接調用即可識別驗證碼;這裏需要使用一個工具類,地址:http://download.csdn.net/download/qq_23339149/9617921
接口類:
import com.alibaba.fastjson.JSONObject;
import com.aweb.platform.util.StringUtils;
import com.dbn.sysmodule.util.IdcardUtils;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Value;
import java.io.*;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Queries a remote site for a person's litigation records by imitating a browser
 * session with commons-httpclient: load the index page, download and OCR the
 * captcha, post the search form, then convert the returned HTML table to JSON.
 *
 * <p>Captcha recognition shells out to {@code python /tmp/captcha.py}, so this
 * implementation expects a Linux host with that script installed.
 *
 * Created by warming on 2016/8/31 with IntelliJ IDEA.
 */
public class OuterInformationServiceImpl implements com.dbn.remote.service.OuterInformationService {

    private static final Logger log = Logger.getLogger(OuterInformationServiceImpl.class);

    /** Text whose presence in the search response is treated as "captcha rejected". */
    private static final String CAPTCHA_ERROR_MARKER = "驗證碼錯誤";

    /** Maximum number of additional attempts after the first search request. */
    private static final int MAX_CAPTCHA_RETRIES = 5;

    /** Charset used to percent-encode the form fields. */
    private static final String FORM_ENCODING = "UTF-8";

    @Value("${zhiXingIndexUrl}")
    private String zhiXingIndexUrl;

    @Value("${verificationCodeUrl}")
    private String verificationCodeUrl;

    @Value("${zhiXingSearchUrl}")
    private String zhiXingSearchUrl;

    @Value("${zhiXingSearchUserAgent}")
    private String zhiXingSearchUserAgent;

    /**
     * Looks up the litigation records of one person.
     *
     * @param pName   person name; must pass {@code StringUtils.checkStr}
     * @param cardNum ID card number; must pass {@code IdcardUtils.validateCard}
     * @return the records as a JSON string, {@code null} when the arguments fail
     *         validation or the response holds no table, or the literal
     *         {@code "查詢失敗"} when the captcha was still rejected after all retries
     * @throws Exception wrapping any failure raised while talking to the remote site
     */
    public String getPersonLitigationRecords(String pName, String cardNum) throws Exception {
        try {
            if (!(StringUtils.checkStr(pName) && IdcardUtils.validateCard(cardNum))) {
                return null;
            }
            HttpClient client = new HttpClient();
            loadIndex(client); // simulate loading the index page first

            String htmlResponse = getRecords(client, pName, cardNum);
            int retries = 0;
            // Re-check the marker after every attempt, including the last one, so a
            // success on the final retry is not reported as a failure.
            while (htmlResponse.contains(CAPTCHA_ERROR_MARKER) && retries < MAX_CAPTCHA_RETRIES) {
                htmlResponse = getRecords(client, pName, cardNum);
                retries++;
            }
            log.info("驗證碼解析錯誤次數:" + retries);
            if (htmlResponse.contains(CAPTCHA_ERROR_MARKER)) {
                return "查詢失敗";
            }

            String jsonStr = getJsonStrByHtml(htmlResponse);
            log.info("查詢結果:" + jsonStr);
            return jsonStr;
        } catch (Exception e) {
            log.error("查詢個人訴訟記錄錯誤!", e);
            throw new Exception("查詢個人訴訟記錄錯誤!", e);
        }
    }

    /**
     * Performs one search attempt: recognises a fresh captcha and posts the form.
     *
     * @return the response body of the search request (possibly empty, never null)
     */
    private String getRecords(HttpClient client, String pName, String cardNum) throws IOException {
        String code = recognizeCaptcha(client);
        log.info("解析驗證碼爲::" + code);

        PostMethod post = new PostMethod(zhiXingSearchUrl);
        post.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
        post.setRequestHeader("Referer", zhiXingIndexUrl);
        post.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
        // Every value is percent-encoded with an explicit charset; the OCR result in
        // particular may contain spaces or '&' that would corrupt the form body.
        post.setRequestBody("searchCourtName=" + URLEncoder.encode("全國法院(包含地方各級法院)", FORM_ENCODING)
                + "&selectCourtId=1&selectCourtArrange=1&pname=" + URLEncoder.encode(pName, FORM_ENCODING)
                + "&cardNum=" + URLEncoder.encode(cardNum, FORM_ENCODING)
                + "&j_captcha=" + URLEncoder.encode(code, FORM_ENCODING));
        try {
            client.executeMethod(post);
            return readBody(post);
        } finally {
            post.releaseConnection();
        }
    }

    /**
     * Downloads the captcha image, stores it under /tmp and runs the OCR script on it.
     *
     * @return the recognised text with surrounding whitespace removed, or an empty
     *         string when the image could not be saved or recognised
     */
    private String recognizeCaptcha(HttpClient client) throws IOException {
        GetMethod get = new GetMethod(verificationCodeUrl);
        get.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
        byte[] image;
        try {
            client.executeMethod(get);
            image = get.getResponseBody();
        } finally {
            get.releaseConnection();
        }

        String fileName = "/tmp/" + System.currentTimeMillis() + getRandom() + ".jpeg";
        boolean saved = false;
        if (image == null) {
            log.warn("將驗證碼寫入本地失敗!");
        } else {
            try (FileOutputStream fout = new FileOutputStream(fileName)) {
                fout.write(image);
                saved = true;
            } catch (IOException e) {
                log.warn("將驗證碼寫入本地失敗!", e);
            }
        }
        if (!saved) {
            // Best effort: an empty code makes the search fail on the captcha check,
            // which the caller answers with a retry. Remove any partial file.
            File partial = new File(fileName);
            if (partial.exists() && !partial.delete()) {
                log.warn("Could not delete captcha image " + fileName);
            }
            return "";
        }

        // exec() terminates every output line with '\n'; strip it before use.
        String raw = exec(fileName);
        return raw == null ? "" : raw.trim();
    }

    /** Reads the whole response body of {@code post}; read failures are logged and yield what was read so far. */
    private String readBody(PostMethod post) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (InputStream is = post.getResponseBodyAsStream()) {
            if (is == null) {
                log.warn("獲取查詢返回頁面失敗!");
            } else {
                byte[] buffer = new byte[4096];
                int read;
                while ((read = is.read(buffer)) != -1) {
                    baos.write(buffer, 0, read);
                }
            }
        } catch (IOException e) {
            log.warn("獲取查詢返回頁面失敗!", e);
        }
        // NOTE(review): decoded with the platform default charset, as before; confirm
        // the charset the remote site actually serves before making this explicit.
        return baos.toString();
    }

    /**
     * Converts the table found in the response HTML into a JSON string of the form
     * {@code {"data":[{header: cell, ...}, ...]}}.
     *
     * @return the JSON string, or {@code null} when the input is blank or contains
     *         no {@code tr}, {@code th} or {@code td} element
     */
    private String getJsonStrByHtml(String htmlResponse) {
        if (!StringUtils.checkStr(htmlResponse)) {
            return null;
        }
        org.jsoup.nodes.Document doc = Jsoup.parse(htmlResponse);
        Elements trs = doc.getElementsByTag("tr");
        Elements ths = doc.getElementsByTag("th");
        Elements tds = doc.getElementsByTag("td");
        int trsSize = trs.size(); // number of rows
        int thsSize = ths.size(); // number of header columns
        int tdsSize = tds.size(); // number of cells
        if (trsSize <= 0 || thsSize <= 0 || tdsSize <= 0) {
            return null;
        }

        List<Object> list = new ArrayList<>();
        // One row fewer than <tr> elements (presumably the header row — TODO confirm).
        for (int j = 0; j < trsSize - 1; j++) {
            int rowStart = thsSize * j;
            // Stop instead of indexing past the last cell when the page holds fewer
            // <td> elements than the row/column counts imply.
            if (rowStart + thsSize - 1 > tdsSize) {
                log.warn("Result table has fewer cells than expected; stopping at row " + j);
                break;
            }
            Map<String, Object> map = new HashMap<>();
            // The last header column is deliberately left out, as in the original
            // loop bound (presumably an action/link column — TODO confirm).
            for (int i = 0; i < thsSize - 1; i++) {
                map.put(ths.get(i).text(), tds.get(rowStart + i).text());
            }
            list.add(map);
        }
        JSONObject data = new JSONObject();
        data.put("data", list);
        return data.toJSONString();
    }

    /** Requests the index page once so that later requests run in an established session. */
    private void loadIndex(HttpClient client) throws IOException {
        GetMethod get = new GetMethod(zhiXingIndexUrl);
        get.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
        try {
            client.executeMethod(get);
            log.info("首頁加載完成");
        } finally {
            get.releaseConnection();
        }
    }

    /**
     * Runs {@code python /tmp/captcha.py <fileName>} and collects its standard output.
     * The image file is deleted afterwards, whether or not recognition succeeded.
     *
     * @param fileName path of the captcha image to recognise
     * @return the script output with each line terminated by {@code "\n"}, or
     *         {@code null} when the script could not be run
     */
    public String exec(String fileName) {
        log.info("驗證文件名稱:" + fileName);
        try {
            // Argument list instead of a single command string: no whitespace
            // tokenising of the file name.
            ProcessBuilder builder = new ProcessBuilder("python", "/tmp/captcha.py", fileName);
            // Keep stderr out of the result and avoid blocking on an unread pipe.
            builder.redirectError(ProcessBuilder.Redirect.INHERIT);
            Process process = builder.start();

            StringBuilder sb = new StringBuilder();
            // Platform default charset on purpose: the child process writes using the
            // same locale as this JVM.
            try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
                String line;
                while ((line = br.readLine()) != null) {
                    sb.append(line).append("\n");
                }
            }
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                log.warn("captcha.py exited with code " + exitCode + " for " + fileName);
            }
            return sb.toString();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("Interrupted while waiting for captcha.py", e);
        } catch (IOException e) {
            log.error("Failed to run captcha.py on " + fileName, e);
        } finally {
            // Delete the generated captcha image.
            File image = new File(fileName);
            if (image.exists() && !image.delete()) {
                log.warn("Could not delete captcha image " + fileName);
            }
        }
        return null;
    }

    /**
     * Returns a random three-digit number.
     *
     * @return a value in the range 100..999, uniformly distributed
     */
    public int getRandom() {
        return 100 + (int) (Math.random() * 900);
    }
}
工具類:
import com.asprise.util.ocr.OCR;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
public class ParseJPEG_withOCR {
public static String getRecogniseStr(InputStream imageFile) {
String s = "";
try {
BufferedImage image = ImageIO.read(imageFile);
int width = image.getTileWidth();
int height = image.getTileHeight();
image = image.getSubimage(0, 0, width, height);
s = new OCR().recognizeEverything(image);
} catch (IOException e) {
e.printStackTrace();
System.out.println(" 圖片識別失敗! ");
}
return s;
}
public static String getRecogniseStrByFile(File imageFile) {
String s = "";
try {
BufferedImage image = ImageIO.read(imageFile);
int width = image.getTileWidth();
int height = image.getTileHeight();
image = image.getSubimage(0, 0, width, height);
s = new OCR().recognizeEverything(image);
} catch (IOException e) {
e.printStackTrace();
System.out.println(" 圖片識別失敗! ");
}
return s;
}
public static void main(String[] args) {
// for (int i = 0; i < 100; i++) {
// String code = getRecogniseStrByFile(new File("D:\\pic\\download/" + i + ".jpeg"));
// System.out.println(code);
// }
}
}
python腳本(文件名命名爲:captcha.py):腳本與驗證碼圖片保存在linux的同一目錄(/tmp)下,執行命令 python /tmp/captcha.py <fileName> 即可返回識別結果;pytesseract類庫可
自行搜索下載:
from PIL import Image
import sys
import pytesseract
def output(imgfile):
    """Print the text that tesseract recognises in the image at ``imgfile``.

    The image is converted to greyscale (mode ``'L'``) before recognition.
    """
    # NOTE(review): '-psm 7' presumably selects tesseract's single-text-line page
    # segmentation mode; confirm the flag spelling against the installed version.
    text = pytesseract.image_to_string(
        Image.open(imgfile).convert('L'),
        config='-psm 7',
    )
    print(text)
if __name__ == "__main__":
    # The image path is the only expected argument. Exit with a usage message
    # (written to stderr, exit status 1) instead of an IndexError traceback
    # when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python captcha.py <image-file>")
    output(sys.argv[1])